diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..2d40fc6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +.git +.venv +.pytest_cache +__pycache__ +**/__pycache__ +*.pyc +.env +data +dbt/target +dbt/dbt_packages +dbt/logs +dbt/.user.yml +airflow/logs +outputs +work +eventsim/data +eventsim/target +images diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9f5cfdc --- /dev/null +++ b/.env.example @@ -0,0 +1,10 @@ +YANDEX_MUSIC_TOKEN= +STREAMIFY_DATA_DIR=data +STREAMIFY_RAW_DIR=data/raw/yamusic +STREAMIFY_DUCKDB_PATH=data/streamify.duckdb +STREAMIFY_REPORT_PATH=data/streamify_summary.md +STREAMIFY_SNAPSHOT_PATH=data/streamify_snapshot.json +STREAMIFY_RECOMMENDATIONS_DIR=data/recommendations +STREAMIFY_DBT_PROFILES_DIR=dbt +STREAMIFY_DASHBOARD_PORT=8501 +DBT_THREADS=1 diff --git a/.github/ISSUE_TEMPLATE/agent_task.yml b/.github/ISSUE_TEMPLATE/agent_task.yml new file mode 100644 index 0000000..df97652 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/agent_task.yml @@ -0,0 +1,41 @@ +name: Agent task +description: Track a multi-agent implementation task for Streamify. +title: "[Agent] " +labels: ["agent-task", "triage"] +body: + - type: dropdown + id: agent + attributes: + label: Agent lane + options: + - Repo/Build + - Yandex Ingestion + - Analytics/dbt + - Product/Dashboard + - QA/Integration + validations: + required: true + - type: textarea + id: objective + attributes: + label: Objective + description: State the product or engineering outcome, not only the code change. + placeholder: "Example: Make real-account ingestion observable and idempotent for playlist metadata." + validations: + required: true + - type: textarea + id: acceptance + attributes: + label: Acceptance checks + description: Commands, artifacts, or runtime behavior that prove completion. 
+ placeholder: | + - make raw-contract + - make dbt-build + - make dashboard-smoke + validations: + required: true + - type: textarea + id: notes + attributes: + label: Notes and risks + description: API risk, privacy constraints, data quality assumptions, or dependencies. diff --git a/.github/ISSUE_TEMPLATE/data_quality.yml b/.github/ISSUE_TEMPLATE/data_quality.yml new file mode 100644 index 0000000..8e092fe --- /dev/null +++ b/.github/ISSUE_TEMPLATE/data_quality.yml @@ -0,0 +1,37 @@ +name: Data quality issue +description: Report a raw, dbt, dashboard, or readiness quality issue. +title: "[DQ] " +labels: ["data-quality", "triage"] +body: + - type: dropdown + id: layer + attributes: + label: Affected layer + options: + - Raw/Bronze + - Silver/dbt staging + - Gold marts + - Dashboard + - CI/Release + validations: + required: true + - type: textarea + id: symptom + attributes: + label: Symptom + description: What failed or looked wrong? + validations: + required: true + - type: textarea + id: evidence + attributes: + label: Evidence + description: Command output, table/model name, manifest counts, or screenshot notes. Do not paste tokens or raw private data. + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected behavior + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/product_request.yml b/.github/ISSUE_TEMPLATE/product_request.yml new file mode 100644 index 0000000..eb564e9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/product_request.yml @@ -0,0 +1,28 @@ +name: Product request +description: Propose a user-facing analytics or workflow improvement. +title: "[Product] " +labels: ["product", "triage"] +body: + - type: textarea + id: user_value + attributes: + label: Product value + description: What decision or action should this help a listener take? 
+ validations: + required: true + - type: textarea + id: data + attributes: + label: Data needed + description: Which Yandex Music metadata, marts, or dashboard views are involved? + validations: + required: true + - type: textarea + id: acceptance + attributes: + label: Acceptance checks + placeholder: | + - make product-answers-smoke + - make dashboard-smoke + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..0c7d3d1 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,25 @@ +## Summary + +- + +## Product Value + +- + +## Data Engineering Impact + +- Raw/Bronze: +- Silver/dbt: +- Gold/dashboard: +- Privacy/security: + +## Checks + +- [ ] `make test` +- [ ] `make acceptance-local` +- [ ] `make acceptance-real` when changing real-account ingestion +- [ ] No `.env`, raw Yandex Music data, DuckDB files, or audio artifacts are tracked + +## Notes + +- diff --git a/.github/workflows/data-quality.yml b/.github/workflows/data-quality.yml index bd846fe..fb8eb27 100644 --- a/.github/workflows/data-quality.yml +++ b/.github/workflows/data-quality.yml @@ -7,7 +7,7 @@ on: - main jobs: - validate-dbt-quality-contract: + validate-local-product-contract: runs-on: ubuntu-latest steps: - name: Checkout repository @@ -18,14 +18,14 @@ jobs: with: python-version: "3.12" - - name: Validate dbt quality contract - run: python3 scripts/validate_dbt_quality.py + - name: Install local dependencies + run: | + python3 -m venv .venv + .venv/bin/python -m pip install --upgrade pip + .venv/bin/python -m pip install -r requirements.txt - - name: Compile Python files - run: python3 -m compileall -q airflow/dags spark_streaming scripts - - - name: Validate Airflow Compose config - run: cd airflow && GCP_PROJECT_ID=dummy GCP_GCS_BUCKET=dummy docker compose config --quiet - - - name: Validate Kafka Compose config - run: cd kafka && docker compose config --quiet + - name: Run local product 
acceptance checks + env: + YANDEX_MUSIC_TOKEN: "" + DBT_THREADS: "1" + run: make test diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 0000000..e4cb2b1 --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,58 @@ +name: GitHub Pages + +on: + push: + branches: + - main + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python3 -m venv .venv + .venv/bin/python -m pip install --upgrade pip + .venv/bin/python -m pip install -r requirements.txt + + - name: Build sample product artifacts + env: + YANDEX_MUSIC_TOKEN: "" + DBT_THREADS: "1" + run: | + make acceptance-local + .venv/bin/python scripts/build_pages_site.py + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@v4 + with: + path: public + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..417fcec --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,53 @@ +name: Release + +on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + +permissions: + contents: write + +jobs: + release: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python3 -m venv .venv + .venv/bin/python -m pip install --upgrade pip + .venv/bin/python -m pip 
# Streamify local product Makefile.
#
# Every runtime command goes through scripts/run_with_dotenv.py (ENV_RUN) so
# that values from .env reach the child process without Make ever parsing
# them (tokens may contain characters that Make would mangle).

PYTHON ?= python3
VENV ?= .venv
VENV_PYTHON := $(VENV)/bin/python
VENV_DBT := $(VENV)/bin/dbt
VENV_STREAMLIT := $(VENV)/bin/streamlit
ENV_RUN := $(VENV_PYTHON) scripts/run_with_dotenv.py
DBT_PROFILES_DIR ?= dbt

# Every target here is a command, not a file. raw-contract must be listed too:
# without it, a stray file named "raw-contract" would make the target a no-op.
.PHONY: help setup token-help status ingest ingest-sample preflight dbt-deps dbt-build dashboard dashboard-smoke doctor report snapshot recommendations readiness readiness-real real-gate-smoke product-answers-smoke pages-site raw-contract acceptance-local acceptance-real compose-smoke-local test up-local compose-check clean-local

# ---------------------------------------------------------------------------
# Guidance
# ---------------------------------------------------------------------------

help:
	@printf '%s\n' 'Streamify local Yandex Music self-analytics'
	@printf '%s\n' ''
	@printf '%s\n' 'First local run with deterministic sample metadata:'
	@printf '%s\n' ' make setup'
	@printf '%s\n' ' make acceptance-local'
	@printf '%s\n' ' make dashboard'
	@printf '%s\n' ''
	@printf '%s\n' 'Real account run after setting YANDEX_MUSIC_TOKEN in .env:'
	@printf '%s\n' ' make token-help'
	@printf '%s\n' ' make status'
	@printf '%s\n' ' make acceptance-real'
	@printf '%s\n' ' make dashboard'
	@printf '%s\n' ''
	@printf '%s\n' 'Docker Compose local profile:'
	@printf '%s\n' ' make up-local'
	@printf '%s\n' ' make compose-smoke-local'
	@printf '%s\n' ''
	@printf '%s\n' 'Useful checks and exports:'
	@printf '%s\n' ' make raw-contract Validate raw JSONL/manifest contracts'
	@printf '%s\n' ' make dbt-build Build local DuckDB/dbt marts'
	@printf '%s\n' ' make report Export markdown summary, JSON snapshot and CSV queues'
	@printf '%s\n' ' make pages-site Build the static GitHub Pages site from safe local artifacts'
	@printf '%s\n' ' make readiness Audit local product readiness'
	@printf '%s\n' ' make test Run full local quality gate'
	@printf '%s\n' ' make clean-local Remove generated local artifacts, preserve .env'

setup:
	$(PYTHON) -m venv $(VENV)
	$(VENV_PYTHON) -m pip install --upgrade pip
	$(VENV_PYTHON) -m pip install -r requirements.txt
	$(MAKE) dbt-deps

token-help:
	@printf '%s\n' 'Streamify needs a ready Yandex Music OAuth token in .env:'
	@printf '%s\n' ' YANDEX_MUSIC_TOKEN=...'
	@printf '%s\n' ''
	@printf '%s\n' 'The installed yandex-music client only accepts a token; it does not obtain one.'
	@printf '%s\n' 'Use an external Yandex Music OAuth token helper, then paste the token into .env.'
	@printf '%s\n' ''
	@printf '%s\n' 'Known community helper:'
	@printf '%s\n' ' https://github.com/MarshalX/yandex-music-token'
	@printf '%s\n' ''
	@printf '%s\n' 'After saving .env, run:'
	@printf '%s\n' ' make preflight'
	@printf '%s\n' ' make acceptance-real'

# ---------------------------------------------------------------------------
# Ingestion
# ---------------------------------------------------------------------------

status:
	$(ENV_RUN) -- $(VENV_PYTHON) -m yamusic_ingest --status

ingest:
	$(ENV_RUN) -- $(VENV_PYTHON) -m yamusic_ingest

ingest-sample:
	$(ENV_RUN) -- $(VENV_PYTHON) -m yamusic_ingest --sample

preflight:
	$(ENV_RUN) -- $(VENV_PYTHON) -m yamusic_ingest --preflight

# ---------------------------------------------------------------------------
# dbt (runs with cwd=dbt, hence the absolute path to the venv's dbt binary)
# ---------------------------------------------------------------------------

dbt-deps:
	$(ENV_RUN) --cwd dbt -- $(abspath $(VENV_DBT)) deps

dbt-build: dbt-deps
	GCP_PROJECT_ID=dummy $(ENV_RUN) --cwd dbt -- $(abspath $(VENV_DBT)) build --profiles-dir . --target local --select yamusic

# ---------------------------------------------------------------------------
# Dashboard, exports and audits
# ---------------------------------------------------------------------------

dashboard:
	$(ENV_RUN) -- $(VENV_STREAMLIT) run dashboard/app.py

dashboard-smoke:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_dashboard_content.py
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_dashboard.py

doctor:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/doctor_yamusic_local.py

report:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/export_yamusic_summary.py
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/export_yamusic_snapshot.py
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/export_yamusic_recommendations.py

snapshot:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/export_yamusic_snapshot.py

recommendations:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/export_yamusic_recommendations.py

readiness:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/audit_yamusic_readiness.py

readiness-real:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/audit_yamusic_readiness.py --require-real

real-gate-smoke:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_real_gate.py

product-answers-smoke:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_product_answers.py

pages-site:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/build_pages_site.py

raw-contract:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/validate_yamusic_raw_contract.py

# ---------------------------------------------------------------------------
# Acceptance pipelines and the full quality gate
# ---------------------------------------------------------------------------

acceptance-local: ingest-sample raw-contract dbt-build doctor report readiness dashboard-smoke

acceptance-real: preflight ingest raw-contract dbt-build doctor report readiness-real dashboard-smoke

compose-smoke-local:
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_compose_local.py

test:
	$(VENV_PYTHON) scripts/validate_dbt_quality.py
	$(VENV_PYTHON) scripts/validate_yamusic_local.py
	$(VENV_PYTHON) scripts/check_no_local_sensitive_artifacts.py
	$(VENV_PYTHON) scripts/check_no_audio_artifacts.py
	$(ENV_RUN) -- $(VENV_PYTHON) scripts/smoke_empty_yamusic_dbt.py
	$(MAKE) acceptance-local
	$(MAKE) product-answers-smoke
	$(MAKE) real-gate-smoke
	$(MAKE) pages-site
	$(VENV_PYTHON) -m compileall -q airflow/dags spark_streaming scripts yamusic_ingest dashboard tests
	$(VENV_PYTHON) -m pytest -q
	# Same four compose validations as before, kept in one place (compose-check).
	$(MAKE) compose-check
	$(MAKE) compose-smoke-local

up-local:
	$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local up --build

compose-check:
	cd airflow && GCP_PROJECT_ID=dummy GCP_GCS_BUCKET=dummy docker compose config --quiet
	cd kafka && docker compose config --quiet
	$(ENV_RUN) -- docker compose -f docker-compose.local.yml config --quiet
	$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local config --quiet

# Removes generated artifacts only; .env and tracked sources are untouched.
clean-local:
	rm -rf data/raw/yamusic data/raw/yamusic_empty data/raw/yamusic_empty_smoke data/processed
	rm -rf data/streamify.duckdb data/streamify.duckdb.wal data/streamify_empty.duckdb data/streamify_empty.duckdb.wal
	rm -rf data/streamify_empty_smoke.duckdb data/streamify_empty_smoke.duckdb.wal data/streamify_summary.md data/streamify_snapshot.json data/recommendations
	rm -rf dbt/target dbt/logs dbt/dbt_packages
	rm -rf public
beaa1ff..10b82b7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,77 @@ # Streamify -A data pipeline with Kafka, Spark Streaming, dbt, Docker, Airflow, Terraform, GCP and much more! +Local-first music self-analytics for Yandex Music metadata, plus the original Kafka/Spark/Airflow/GCP streaming pipeline. + +Streamify now has two compatible tracks: + +- **Local product track**: ingest your Yandex Music metadata, build DuckDB/dbt marts, and open a Streamlit dashboard without cloud cost. +- **Legacy cloud engineering track**: keep the original Eventsim, Kafka, Spark Streaming, Airflow, dbt BigQuery and GCP architecture for portfolio-grade data engineering. + +The local product track stores metadata and derived analytics only. It does not download or store audio. + +## Local Yandex Music Self-Analytics + +Product value: turn your own Yandex Music library into a reproducible local lakehouse that answers practical questions about your listening taste and library shape: favorite artists and tracks, genre shifts, playlist overlap, repeated patterns, diversity, active periods, underrated tracks and playlists, local data quality, and what data is missing. + +First run without credentials: + +```bash +cp .env.example .env +make setup +make help +make status +make ingest-sample +make raw-contract +make dbt-build +make doctor +make report +make readiness +make dashboard-smoke +make dashboard +``` + +Then open the Streamlit URL printed by `make dashboard`. + +Run with your account metadata: + +```bash +cp .env.example .env +make token-help +# Get a Yandex Music OAuth token with an external helper, then set YANDEX_MUSIC_TOKEN in .env. 
+make acceptance-real +make dashboard +``` + +Local defaults: + +- command guide: `make help` +- token guide: `make token-help` +- raw metadata: `data/raw/yamusic/*.jsonl` +- local warehouse: `data/streamify.duckdb` +- local configuration: `.env` is loaded by the Python CLI/scripts and by `scripts/run_with_dotenv.py` for Makefile commands, so token and path overrides work without Make parsing token values. +- dbt target: `dbt build --profiles-dir . --target local --select yamusic` +- dbt packages: `make setup` and `make dbt-build` both run `dbt deps`, so a fresh checkout does not rely on ignored local `dbt/dbt_packages`. +- dbt local threads: `DBT_THREADS=1` by default for stable laptop/container runs; raise it explicitly if your environment is stable. +- local status: `make status` prints safe configuration/readiness hints without calling Yandex Music or printing token values. +- token preflight: `make preflight` checks real Yandex Music API access without writing raw data or printing the token. +- dashboard: `streamlit run dashboard/app.py` +- dashboard smoke: `make dashboard-smoke` +- static self-analytics report: `make report`, written to `data/streamify_summary.md` +- structured self-analytics snapshot: `make snapshot`, written to `data/streamify_snapshot.json` for automation and downstream agent workflows. +- spreadsheet action queues: `make recommendations`, written to `data/recommendations/*.csv` for rediscovery, playlist cleanup, standout playlists, top artists and genre shifts. +- static GitHub Pages site: `make pages-site`, generated into ignored `public/` from docs and safe sample/report artifacts. +- readiness audit: `make readiness`, which verifies raw counts, DuckDB marts, report, no audio artifacts and whether the latest run is sample or real Yandex Music metadata. 
+- local acceptance check: `make doctor` +- real-account acceptance: `make acceptance-real`, which also runs `make readiness-real` and fails unless the latest manifest source is `yandex_music`. +- safety guard: `scripts/check_no_local_sensitive_artifacts.py` keeps root `.env`, Yandex raw data, DuckDB files and local audio out of git. +- raw schema contract: `make raw-contract` +- Docker Compose smoke: `make compose-smoke-local` +- one-command container path: `make up-local`, which loads `.env` through `scripts/run_with_dotenv.py` and runs Docker Compose with the `local` profile. It uses real Yandex Music metadata when `YANDEX_MUSIC_TOKEN` is present in `.env`, otherwise it writes deterministic sample metadata. +- local reset: `make clean-local` removes generated raw metadata, DuckDB databases, summary/snapshot/recommendations reports, dbt target/logs/packages, and smoke-test artifacts while preserving `.env` and source files. + +See [docs/yandex_music_local.md](docs/yandex_music_local.md) for the local architecture, token handling, and limitations. See [docs/yamusic_lineage.md](docs/yamusic_lineage.md) for raw-to-dashboard lineage and model ownership. See [docs/product_acceptance.md](docs/product_acceptance.md) for the requirement-to-command acceptance matrix. + +GitHub delivery is managed through issue templates, a PR checklist, sample-data CI, GitHub Pages, and tag-based releases. See [docs/project_management.md](docs/project_management.md) and [docs/release_process.md](docs/release_process.md). 
def safe_int(value: Any) -> int:
    """Coerce *value* to ``int``, returning 0 when that is not possible.

    ``None``, non-numeric strings, NaN and infinities all map to 0 so that
    profile rows with missing or malformed metrics never raise.
    """
    if value is None:
        return 0
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; OverflowError: +/- infinity.
        return 0


def safe_float(value: Any) -> float:
    """Coerce *value* to ``float``, returning 0.0 when that is not possible.

    NaN is treated as missing and mapped to 0.0: ``float(nan)`` succeeds, so
    without the explicit check a missing metric would leak through as NaN.
    """
    import math

    if value is None:
        return 0.0
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    return 0.0 if math.isnan(number) else number


def build_data_next_actions(profile: Mapping[str, Any]) -> list[str]:
    """Derive human-readable follow-up actions from a library profile row.

    Args:
        profile: Mapping of profile column names to values. Every key is
            optional; missing or malformed metrics are read as 0 and a missing
            ``manifest_source`` is read as ``"unknown"``.

    Returns:
        A non-empty list of action sentences, in a fixed priority order. When
        no issue is detected the list holds a single "ready for exploration"
        message.
    """
    actions: list[str] = []
    source = str(profile.get("manifest_source") or "unknown")
    total_tracks = safe_int(profile.get("total_tracks"))
    known_genres = safe_int(profile.get("known_genres"))
    stale = safe_int(profile.get("stale_ingestion_flag")) == 1
    liked_fetch_failures = safe_int(profile.get("diagnostic_liked_shortcuts_fetch_failed"))
    playlist_track_fetch_failures = safe_int(profile.get("diagnostic_playlist_tracks_fetch_failed"))
    playlist_track_missing_ids = safe_int(profile.get("diagnostic_playlist_tracks_missing_track_id"))
    duplicate_liked_tracks = safe_int(profile.get("diagnostic_liked_tracks_duplicate_skipped"))
    duplicate_playlist_tracks = safe_int(profile.get("diagnostic_playlist_tracks_duplicate_skipped"))
    top_artist_concentration = safe_float(profile.get("top_artist_concentration"))

    if source != "yandex_music":
        actions.append("Replace sample metadata with account metadata: set YANDEX_MUSIC_TOKEN in .env and run make acceptance-real.")
    if total_tracks == 0:
        actions.append("No library rows are available; run make status, then verify account visibility with make preflight.")
    if stale:
        actions.append("Refresh ingestion because stale_ingestion_flag is true; rerun make ingest and make dbt-build.")
    if liked_fetch_failures > 0:
        actions.append(f"Investigate partial liked-track hydration: {liked_fetch_failures} liked shortcuts failed to fetch.")
    if playlist_track_fetch_failures > 0:
        actions.append(f"Investigate partial playlist-track enrichment: {playlist_track_fetch_failures} playlist shortcuts failed to fetch.")
    if playlist_track_missing_ids > 0:
        actions.append(f"Inspect playlist metadata quality: {playlist_track_missing_ids} playlist rows had no stable track id.")
    if duplicate_liked_tracks > 0 or duplicate_playlist_tracks > 0:
        actions.append(
            "Duplicate library rows were skipped during ingestion; review the Data Quality tab before comparing playlist overlap across runs."
        )
    if total_tracks > 0 and known_genres == 0:
        actions.append("Genre coverage is missing; use artist and playlist signals as the primary analytics views.")
    if top_artist_concentration >= 0.5:
        actions.append("Taste is concentrated around the top artist; use underrated tracks and genre views to find variety.")
    if not actions:
        # Nothing to fix: point the user at the exploratory views instead.
        actions.append("Data is ready for exploration; review rediscovery tracks, playlist overlap and genre shifts.")

    return actions
def _is_missing(value: object) -> bool:
    """Return True when *value* is a scalar missing marker (None, NaN, NA, NaT)."""
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-likes; its truth value is
        # ambiguous, and such a value is not a scalar missing marker anyway.
        return False


def safe_int(value: object) -> int:
    """Coerce a profile cell to ``int``; missing or malformed values become 0."""
    if _is_missing(value):
        return 0
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, unsupported types, or +/- infinity.
        return 0


def safe_float(value: object) -> float:
    """Coerce a profile cell to ``float``; missing or malformed values become 0.0."""
    if _is_missing(value):
        return 0.0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def percent_label(value: object) -> str:
    """Format a 0..1 ratio as a percentage with one decimal, e.g. ``"25.0%"``."""
    return f"{safe_float(value) * 100:.1f}%"


def yes_no(value: object) -> str:
    """Render a truthy/falsy flag cell as ``"yes"`` or ``"no"``."""
    return "yes" if safe_int(value) else "no"
Run ingestion and dbt build first.") + st.code("make ingest-sample\nmake dbt-build", language="bash") + st.exception(exc) + st.stop() + +if profile.empty: + st.warning("No library data is available yet.") + st.stop() + +row = profile.iloc[0] +has_library_data = safe_int(row["total_tracks"]) > 0 +metric_cols = st.columns(5) +metric_cols[0].metric("Tracks", safe_int(row["total_tracks"])) +metric_cols[1].metric("Liked", safe_int(row["liked_tracks"])) +metric_cols[2].metric("Artists", safe_int(row["artists"])) +metric_cols[3].metric("Playlists", safe_int(row["playlists"])) +metric_cols[4].metric("Hours", safe_float(row["library_hours"])) + +source_cols = st.columns(3) +source_cols[0].metric("Source", str(row["manifest_source"])) +source_cols[1].metric("Raw tracks", safe_int(row["raw_tracks"])) +source_cols[2].metric( + "Manifest generated", + "missing" if pd.isna(row["manifest_generated_at"]) else str(row["manifest_generated_at"])[:19], +) +st.caption( + f"Ingestion adapter: {row['adapter_name']} {row['adapter_version']} " + f"using {row['client_library']} {'' if pd.isna(row['client_library_version']) else row['client_library_version']}" +) + +signal_cols = st.columns(5) +signal_cols[0].metric("Known genres", safe_int(row["known_genres"])) +signal_cols[1].metric("Active months", safe_int(row["active_months"])) +signal_cols[2].metric("Underrated tracks", safe_int(row["underrated_tracks"])) +signal_cols[3].metric("Underrated playlists", safe_int(row["underrated_playlists"])) +signal_cols[4].metric("Top artist concentration", percent_label(row["top_artist_concentration"])) + +if not has_library_data: + st.warning("No Yandex Music library metadata was returned for this run.") + st.code("make ingest\nmake dbt-build", language="bash") + +genre_options = query( + """ + select distinct coalesce(genre, 'unknown') as genre + from yamusic_dim_tracks + order by genre + """ +)["genre"].tolist() + +st.sidebar.header("Filters") +selected_genres = st.sidebar.multiselect("Genres", 
genre_options, default=genre_options) +liked_mode = st.sidebar.selectbox("Liked", ["All", "Liked", "Not liked"]) +track_search = st.sidebar.text_input("Search").strip().lower() + +tab_overview, tab_periods, tab_artists, tab_genres, tab_playlists, tab_tracks, tab_actions, tab_quality = st.tabs( + ["Overview", "Periods", "Artists", "Genres", "Playlists", "Tracks", "Actions", "Data Quality"] +) + +with tab_overview: + tracks = query( + """ + select title, artist_display, album_title, genre, liked, duration_seconds + from yamusic_dim_tracks + order by liked desc, title + limit 5000 + """ + ) + tracks = apply_track_filters(tracks, selected_genres, liked_mode, track_search) + st.subheader("Library snapshot") + st.metric("Filtered tracks", len(tracks.index)) + st.dataframe(tracks, use_container_width=True, hide_index=True) + +with tab_periods: + periods = query( + """ + select activity_month, event_count, liked_events, playlist_events, active_tracks, active_artists, active_genres + from yamusic_period_activity + order by activity_month + """ + ) + st.subheader("Activity periods") + if not periods.empty: + chart_data = periods.set_index("activity_month")[["event_count", "active_tracks", "active_artists"]] + st.line_chart(chart_data) + st.dataframe(periods, use_container_width=True, hide_index=True) + genre_periods = query( + """ + select activity_month, genre, event_count, active_tracks, event_share_in_month + from yamusic_genre_periods + order by activity_month, event_share_in_month desc, genre + """ + ) + st.subheader("Genre shifts") + if not genre_periods.empty: + genre_shift_chart = genre_periods.pivot( + index="activity_month", columns="genre", values="event_share_in_month" + ).fillna(0) + st.line_chart(genre_shift_chart) + genre_periods["event_share_in_month"] = genre_periods["event_share_in_month"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(genre_periods, use_container_width=True, hide_index=True) + +with tab_artists: + artists = query( + """ + select 
artist_name, track_count, liked_track_count, playlist_appearances + from yamusic_artist_affinity + order by track_count desc, playlist_appearances desc, artist_name + limit 30 + """ + ) + st.subheader("Artist affinity") + if not artists.empty: + top_artist = artists.iloc[0] + st.caption( + f"Top artist: {top_artist['artist_name']} with {safe_int(top_artist['track_count'])} tracks " + f"and {safe_int(top_artist['liked_track_count'])} liked tracks." + ) + st.bar_chart(artists.set_index("artist_name")["track_count"]) + st.dataframe(artists, use_container_width=True, hide_index=True) + +with tab_genres: + genres = query( + """ + select genre, track_count, liked_track_count, library_hours, track_share + from yamusic_genre_profile + order by track_count desc, genre + """ + ) + st.subheader("Genre diversity") + if not genres.empty: + st.bar_chart(genres.set_index("genre")["track_count"]) + genres["track_share"] = genres["track_share"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(genres, use_container_width=True, hide_index=True) + genre_periods = query( + """ + select activity_month, genre, event_share_in_month, event_count, active_tracks + from yamusic_genre_periods + order by activity_month desc, event_share_in_month desc, genre + """ + ) + st.subheader("Genre shifts by month") + if not genre_periods.empty: + genre_periods["event_share_in_month"] = genre_periods["event_share_in_month"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(genre_periods, use_container_width=True, hide_index=True) + +with tab_playlists: + playlists = query( + """ + select playlist_title, actual_track_count, unique_track_count, declared_track_count + from yamusic_dim_playlists + order by actual_track_count desc, playlist_title + """ + ) + st.subheader("Playlist coverage") + st.dataframe(playlists, use_container_width=True, hide_index=True) + playlist_signals = query( + """ + select playlist_title, uniqueness_ratio, max_overlap, overlapped_track_mentions, 
underrated_playlist_flag + from yamusic_playlist_signals + order by underrated_playlist_flag desc, uniqueness_ratio desc, playlist_title + """ + ) + st.subheader("Underrated playlist signals") + if not playlist_signals.empty: + playlist_signals["uniqueness_ratio"] = playlist_signals["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") + playlist_signals["max_overlap"] = playlist_signals["max_overlap"].map(lambda value: f"{value * 100:.1f}%") + playlist_signals["underrated_playlist_flag"] = playlist_signals["underrated_playlist_flag"].map(lambda value: "yes" if value else "no") + st.dataframe(playlist_signals, use_container_width=True, hide_index=True) + overlap = query( + """ + select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap + from yamusic_playlist_overlap + order by overlap_track_count desc, jaccard_overlap desc + limit 50 + """ + ) + st.subheader("Playlist overlap") + if not overlap.empty: + overlap["jaccard_overlap"] = overlap["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.dataframe(overlap, use_container_width=True, hide_index=True) + +with tab_tracks: + track_signals = query( + """ + select + title, + artist_display, + genre, + liked, + playlist_count, + event_count, + repeat_signal, + underrated_flag, + first_event_ts, + last_event_ts + from yamusic_track_signals + order by underrated_flag desc, repeat_signal desc, title + limit 5000 + """ + ) + track_signals = apply_track_filters(track_signals, selected_genres, liked_mode, track_search) + st.subheader("Repeated and underrated tracks") + st.metric("Filtered track signals", len(track_signals.index)) + if not track_signals.empty: + top_repeat = track_signals.sort_values(["repeat_signal", "playlist_count"], ascending=False).iloc[0] + st.caption( + f"Highest repeat signal: {top_repeat['title']} by {top_repeat['artist_display']} " + f"with score {safe_int(top_repeat['repeat_signal'])}." 
+ ) + track_signals["liked"] = track_signals["liked"].map(lambda value: "yes" if value else "no") + track_signals["underrated_flag"] = track_signals["underrated_flag"].map(lambda value: "yes" if value else "no") + st.dataframe(track_signals, use_container_width=True, hide_index=True) + +with tab_actions: + st.subheader("Next actions") + action_profile = row.to_dict() + for action in build_data_next_actions(action_profile): + st.write(f"- {action}") + + rediscovery = query( + """ + select title, artist_display, genre, playlist_slots, playlist_count + from yamusic_track_signals + where underrated_flag = true + order by playlist_slots asc, playlist_count asc, title + limit 25 + """ + ) + st.subheader("Rediscovery queue") + st.dataframe(rediscovery, use_container_width=True, hide_index=True) + + cleanup = query( + """ + select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap + from yamusic_playlist_overlap + order by jaccard_overlap desc, overlap_track_count desc, playlist_a_title, playlist_b_title + limit 25 + """ + ) + if not cleanup.empty: + cleanup["jaccard_overlap"] = cleanup["jaccard_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.subheader("Playlist cleanup candidates") + st.dataframe(cleanup, use_container_width=True, hide_index=True) + + standout_playlists = query( + """ + select playlist_title, actual_track_count, unique_track_count, uniqueness_ratio, max_overlap + from yamusic_playlist_signals + where underrated_playlist_flag = true + order by uniqueness_ratio desc, actual_track_count desc, playlist_title + limit 25 + """ + ) + if not standout_playlists.empty: + standout_playlists["uniqueness_ratio"] = standout_playlists["uniqueness_ratio"].map(lambda value: f"{value * 100:.1f}%") + standout_playlists["max_overlap"] = standout_playlists["max_overlap"].map(lambda value: f"{value * 100:.1f}%") + st.subheader("Standout playlists") + st.dataframe(standout_playlists, use_container_width=True, hide_index=True) + + export_cols = 
st.columns(2) + if REPORT_PATH.exists(): + export_cols[0].download_button( + "Download summary", + data=REPORT_PATH.read_text(encoding="utf-8"), + file_name=REPORT_PATH.name, + mime="text/markdown", + ) + if SNAPSHOT_PATH.exists(): + export_cols[1].download_button( + "Download snapshot", + data=SNAPSHOT_PATH.read_text(encoding="utf-8"), + file_name=SNAPSHOT_PATH.name, + mime="application/json", + ) + recommendation_files = sorted(RECOMMENDATIONS_DIR.glob("*.csv")) if RECOMMENDATIONS_DIR.exists() else [] + if recommendation_files: + st.subheader("Download action queues") + for path in recommendation_files: + st.download_button( + path.stem.replace("_", " ").title(), + data=path.read_text(encoding="utf-8"), + file_name=path.name, + mime="text/csv", + ) + +with tab_quality: + quality = { + "database": str(DB_PATH), + "manifest_source": str(row["manifest_source"]), + "manifest_generated_at": None if pd.isna(row["manifest_generated_at"]) else str(row["manifest_generated_at"]), + "manifest_raw_dir": str(row["manifest_raw_dir"]), + "manifest_json_only": bool(row["manifest_json_only"]), + "adapter": { + "adapter_name": str(row["adapter_name"]), + "adapter_version": str(row["adapter_version"]), + "client_library": str(row["client_library"]), + "client_library_version": None if pd.isna(row["client_library_version"]) else str(row["client_library_version"]), + }, + "ingestion_diagnostics": { + "liked_shortcuts_seen": safe_int(row["diagnostic_liked_shortcuts_seen"]), + "liked_tracks_written": safe_int(row["diagnostic_liked_tracks_written"]), + "liked_shortcuts_fetch_failed": safe_int(row["diagnostic_liked_shortcuts_fetch_failed"]), + "liked_shortcuts_missing_track_id": safe_int(row["diagnostic_liked_shortcuts_missing_track_id"]), + "liked_tracks_duplicate_skipped": safe_int(row["diagnostic_liked_tracks_duplicate_skipped"]), + "liked_albums_seen": safe_int(row["diagnostic_liked_albums_seen"]), + "liked_albums_written": safe_int(row["diagnostic_liked_albums_written"]), + 
"liked_albums_missing_id": safe_int(row["diagnostic_liked_albums_missing_id"]), + "liked_albums_duplicate_skipped": safe_int(row["diagnostic_liked_albums_duplicate_skipped"]), + "liked_artists_seen": safe_int(row["diagnostic_liked_artists_seen"]), + "liked_artists_written": safe_int(row["diagnostic_liked_artists_written"]), + "liked_artists_missing_id": safe_int(row["diagnostic_liked_artists_missing_id"]), + "liked_artists_duplicate_skipped": safe_int(row["diagnostic_liked_artists_duplicate_skipped"]), + "liked_playlists_seen": safe_int(row["diagnostic_liked_playlists_seen"]), + "liked_playlists_written": safe_int(row["diagnostic_liked_playlists_written"]), + "liked_playlists_missing_id": safe_int(row["diagnostic_liked_playlists_missing_id"]), + "liked_playlists_duplicate_skipped": safe_int(row["diagnostic_liked_playlists_duplicate_skipped"]), + "playlists_seen": safe_int(row["diagnostic_playlists_seen"]), + "playlists_written": safe_int(row["diagnostic_playlists_written"]), + "playlists_missing_id": safe_int(row["diagnostic_playlists_missing_id"]), + "playlist_fetch_fallbacks": safe_int(row["diagnostic_playlist_fetch_fallbacks"]), + "playlist_tracks_seen": safe_int(row["diagnostic_playlist_tracks_seen"]), + "playlist_tracks_written": safe_int(row["diagnostic_playlist_tracks_written"]), + "playlist_tracks_fetch_failed": safe_int(row["diagnostic_playlist_tracks_fetch_failed"]), + "playlist_tracks_missing_track_id": safe_int(row["diagnostic_playlist_tracks_missing_track_id"]), + "playlist_tracks_duplicate_skipped": safe_int(row["diagnostic_playlist_tracks_duplicate_skipped"]), + }, + "raw_counts": { + "tracks": safe_int(row["raw_tracks"]), + "artists": safe_int(row["raw_artists"]), + "albums": safe_int(row["raw_albums"]), + "playlists": safe_int(row["raw_playlists"]), + "playlist_tracks": safe_int(row["raw_playlist_tracks"]), + "user_library_events": safe_int(row["raw_user_library_events"]), + }, + "raw_checksums": { + "tracks": str(row["raw_tracks_sha256"]), + 
def apply_track_filters(frame: pd.DataFrame, genres: list[str], liked_mode: str, search_text: str) -> pd.DataFrame:
    """Return the rows of *frame* that pass the genre, liked and text filters.

    The input frame is never modified; filtering starts from a copy.

    Args:
        frame: Track rows. The ``genre``, ``liked``, ``title``,
            ``artist_display`` and ``album_title`` columns are each optional;
            a filter whose column is absent is skipped.
        genres: Genres to keep. An empty list disables the genre filter.
            Missing genres are matched under the label ``"unknown"``.
        liked_mode: ``"Liked"`` keeps rows where ``liked`` is true,
            ``"Not liked"`` keeps rows where it is false; any other value
            disables the filter.
        search_text: Case-insensitive literal substring matched against the
            available text columns. Blank text disables the search.

    Returns:
        The filtered frame, with the original index labels and row order.
    """
    result = frame.copy()

    if "genre" in result.columns and genres:
        genre_labels = result["genre"].fillna("unknown")
        result = result[genre_labels.isin(genres)]

    if "liked" in result.columns:
        if liked_mode == "Liked":
            result = result[result["liked"] == True]  # noqa: E712
        elif liked_mode == "Not liked":
            result = result[result["liked"] == False]  # noqa: E712

    needle = search_text.strip().lower()
    if not needle:
        return result

    text_columns = [name for name in ("title", "artist_display", "album_title") if name in result.columns]
    if not text_columns:
        return result

    # A row matches when any available text column contains the needle.
    hits = pd.Series(False, index=result.index)
    for name in text_columns:
        lowered = result[name].fillna("").str.lower()
        hits = hits | lowered.str.contains(needle, regex=False)
    return result[hits]
tests: - not_null - relationships: - to: ref('dim_location') - field: locationKey + arguments: + to: ref('dim_location') + field: locationKey - name: ts description: Original listening event timestamp. tests: @@ -71,13 +76,15 @@ models: description: User gender as produced by Eventsim. tests: - accepted_values: - values: ['M', 'F'] + arguments: + values: ['M', 'F'] - name: level description: Subscription level tracked as SCD2 attribute. tests: - not_null - accepted_values: - values: ['free', 'paid'] + arguments: + values: ['free', 'paid'] - name: rowActivationDate description: First date when this SCD2 row is active. tests: @@ -91,7 +98,8 @@ models: tests: - not_null - accepted_values: - values: [0, 1] + arguments: + values: [0, 1] - name: dim_songs description: Song dimension from Million Song Dataset seed/source data. @@ -166,7 +174,8 @@ models: tests: - not_null - accepted_values: - values: [true, false] + arguments: + values: [true, false] - name: wide_streams description: Dashboard-friendly denormalized view over fact_streams and core dimensions. 
-- Artist-level affinity mart: one row per artist name with the number of
-- library tracks, how many of them are liked, and how often those tracks
-- appear in playlists.

with exploded as (
    -- One row per entry of each track's artist_names list.
    select
        tracks.track_id,
        tracks.liked,
        unnest(tracks.artist_names) as artist_name
    from {{ ref('yamusic_dim_tracks') }} as tracks
),

track_artist as (
    -- Collapse repeated artist names on the same track. track_count below is
    -- a distinct count, but the liked / playlist sums are not, so without
    -- this step a duplicated name would inflate liked_track_count and
    -- playlist_appearances beyond track_count.
    select distinct
        track_id,
        liked,
        artist_name
    from exploded
),

playlist_presence as (
    -- Number of distinct playlists each track belongs to.
    select
        track_id,
        count(distinct playlist_id) as playlist_count
    from {{ ref('yamusic_fact_playlist_tracks') }}
    group by 1
)

select
    artist_name,
    count(distinct track_artist.track_id) as track_count,
    sum(case when liked then 1 else 0 end) as liked_track_count,
    coalesce(sum(playlist_presence.playlist_count), 0) as playlist_appearances,
    -- Tracks that are in no playlist count as 0 appearances in the average.
    round(avg(coalesce(playlist_presence.playlist_count, 0)), 2) as avg_playlist_appearances_per_track
from track_artist
left join playlist_presence using (track_id)
where artist_name is not null and artist_name != ''
group by 1
-- Playlist dimension: staging playlist attributes plus the track counts
-- actually observed in the playlist-track staging model.

with membership_counts as (
    select
        playlist_id,
        count(*) as actual_track_count,                 -- stored membership rows
        count(distinct track_id) as unique_track_count  -- distinct tracks among them
    from {{ ref('stg_yamusic_playlist_tracks') }}
    group by playlist_id
)

select
    playlists.playlist_id,
    playlists.playlist_title,
    playlists.declared_track_count,
    -- Playlists with no membership rows report 0 rather than NULL.
    coalesce(membership_counts.actual_track_count, 0) as actual_track_count,
    coalesce(membership_counts.unique_track_count, 0) as unique_track_count,
    playlists.source,
    playlists.ingested_at
from {{ ref('stg_yamusic_playlists') }} as playlists
left join membership_counts
    on playlists.playlist_id = membership_counts.playlist_id
-- Library profile mart: combines manifest metadata with library-wide
-- aggregates from the other yamusic marts into a single wide row.
-- Every CTE except manifest is an aggregate without GROUP BY, so each yields
-- exactly one row and the cross joins at the bottom do not multiply rows.

-- Track-level totals for the whole library.
with totals as (
    select
        count(*) as total_tracks,
        sum(case when liked then 1 else 0 end) as liked_tracks,
        count(distinct album_id) as albums,
        -- duration_ms -> hours (3,600,000 ms per hour)
        sum(duration_ms) / 3600000.0 as library_hours
    from {{ ref('yamusic_dim_tracks') }}
),

-- Playlist count plus summed observed / distinct track counts.
playlists as (
    select
        count(*) as playlists,
        sum(actual_track_count) as playlist_track_slots,
        sum(unique_track_count) as playlist_unique_tracks
    from {{ ref('yamusic_dim_playlists') }}
),

-- Number of artists and the track count of the most represented artist.
artists as (
    select
        count(*) as artists,
        max(track_count) as top_artist_track_count
    from {{ ref('yamusic_artist_affinity') }}
),

-- Genre diversity; the 'unknown' bucket is excluded from known_genres.
genres as (
    select
        count(*) filter (where genre != 'unknown') as known_genres,
        max(track_share) as top_genre_share
    from {{ ref('yamusic_genre_profile') }}
),

-- underrated_flag is a 0/1 integer, so summing it counts flagged tracks.
track_signals as (
    select
        sum(underrated_flag) as underrated_tracks,
        max(repeat_signal) as max_repeat_signal
    from {{ ref('yamusic_track_signals') }}
),

-- Number of months with activity and the event count of the busiest month.
periods as (
    select
        count(*) as active_months,
        max(event_count) as busiest_month_events
    from {{ ref('yamusic_period_activity') }}
),

-- underrated_playlist_flag is a 0/1 integer, so summing it counts playlists.
playlist_signals as (
    select
        sum(underrated_playlist_flag) as underrated_playlists
    from {{ ref('yamusic_playlist_signals') }}
),

-- Age of the newest library event in whole hours; both columns are NULL when
-- the events table is empty.
-- NOTE(review): ingested_at is compared against current_timestamp; confirm
-- both are expressed in the same time zone.
freshness as (
    select
        max(ingested_at) as last_ingested_at,
        date_diff('hour', max(ingested_at), current_timestamp) as ingestion_age_hours
    from {{ ref('yamusic_fact_library_events') }}
),

-- NOTE(review): presumably a single-row model (latest manifest); if it ever
-- returns several rows the cross join below repeats the profile per row.
manifest as (
    select *
    from {{ ref('stg_yamusic_manifest') }}
)

select
    -- Manifest metadata and ingestion diagnostics, passed through unchanged.
    manifest.manifest_source,
    manifest.manifest_generated_at,
    manifest.manifest_raw_dir,
    manifest.manifest_json_only,
    manifest.adapter_name,
    manifest.adapter_version,
    manifest.client_library,
    manifest.client_library_version,
    manifest.diagnostic_liked_shortcuts_seen,
    manifest.diagnostic_liked_tracks_written,
    manifest.diagnostic_liked_shortcuts_fetch_failed,
    manifest.diagnostic_liked_shortcuts_missing_track_id,
    manifest.diagnostic_liked_tracks_duplicate_skipped,
    manifest.diagnostic_liked_albums_seen,
    manifest.diagnostic_liked_albums_written,
    manifest.diagnostic_liked_albums_missing_id,
    manifest.diagnostic_liked_albums_duplicate_skipped,
    manifest.diagnostic_liked_artists_seen,
    manifest.diagnostic_liked_artists_written,
    manifest.diagnostic_liked_artists_missing_id,
    manifest.diagnostic_liked_artists_duplicate_skipped,
    manifest.diagnostic_liked_playlists_seen,
    manifest.diagnostic_liked_playlists_written,
    manifest.diagnostic_liked_playlists_missing_id,
    manifest.diagnostic_liked_playlists_duplicate_skipped,
    manifest.diagnostic_playlists_seen,
    manifest.diagnostic_playlists_written,
    manifest.diagnostic_playlists_missing_id,
    manifest.diagnostic_playlist_fetch_fallbacks,
    manifest.diagnostic_playlist_tracks_seen,
    manifest.diagnostic_playlist_tracks_written,
    manifest.diagnostic_playlist_tracks_fetch_failed,
    manifest.diagnostic_playlist_tracks_missing_track_id,
    manifest.diagnostic_playlist_tracks_duplicate_skipped,
    -- Raw row counts and file checksums recorded in the manifest.
    manifest.raw_tracks,
    manifest.raw_tracks_sha256,
    manifest.raw_artists,
    manifest.raw_artists_sha256,
    manifest.raw_albums,
    manifest.raw_albums_sha256,
    manifest.raw_playlists,
    manifest.raw_playlists_sha256,
    manifest.raw_playlist_tracks,
    manifest.raw_playlist_tracks_sha256,
    manifest.raw_user_library_events,
    manifest.raw_user_library_events_sha256,
    -- Library aggregates. sum()/max() return NULL over empty inputs, hence
    -- the coalesce(..., 0) wrappers; count() already returns 0.
    totals.total_tracks,
    coalesce(totals.liked_tracks, 0) as liked_tracks,
    totals.albums,
    artists.artists,
    playlists.playlists,
    coalesce(playlists.playlist_track_slots, 0) as playlist_track_slots,
    coalesce(playlists.playlist_unique_tracks, 0) as playlist_unique_tracks,
    coalesce(round(totals.library_hours, 2), 0) as library_hours,
    -- Share of the library held by the top artist; nullif guards the division
    -- for an empty library.
    coalesce(round(artists.top_artist_track_count * 1.0 / nullif(totals.total_tracks, 0), 3), 0) as top_artist_concentration,
    genres.known_genres,
    coalesce(genres.top_genre_share, 0) as top_genre_share,
    coalesce(track_signals.underrated_tracks, 0) as underrated_tracks,
    coalesce(track_signals.max_repeat_signal, 0) as max_repeat_signal,
    periods.active_months,
    coalesce(periods.busiest_month_events, 0) as busiest_month_events,
    coalesce(playlist_signals.underrated_playlists, 0) as underrated_playlists,
    freshness.last_ingested_at,
    -- Reported as 0 when there are no events at all, so an age of 0 does not
    -- by itself mean fresh data; stale_ingestion_flag covers that case.
    coalesce(freshness.ingestion_age_hours, 0) as ingestion_age_hours,
    -- Stale when nothing was ever ingested or the newest event is older than
    -- 168 hours (7 days).
    case
        when freshness.last_ingested_at is null then 1
        when freshness.ingestion_age_hours > 168 then 1
        else 0
    end as stale_ingestion_flag,
    current_timestamp as calculated_at
from totals
cross join playlists
cross join artists
cross join genres
cross join track_signals
cross join periods
cross join playlist_signals
cross join freshness
cross join manifest
-- Pairwise playlist overlap mart: one row per unordered playlist pair that
-- shares at least one track, with the shared-track count and the Jaccard
-- similarity |A and B| / |A or B|.

with pairs as (
    select
        left_tracks.playlist_id as playlist_a_id,
        right_tracks.playlist_id as playlist_b_id,
        -- Count distinct shared tracks. playlist_sizes below measures
        -- playlists in distinct tracks, so the intersection must use the same
        -- unit: with count(*) a track stored more than once in a playlist
        -- would be counted once per matching row pair, which can push the
        -- ratio above 1 or make the union size negative.
        count(distinct left_tracks.track_id) as overlap_track_count
    from {{ ref('yamusic_fact_playlist_tracks') }} as left_tracks
    join {{ ref('yamusic_fact_playlist_tracks') }} as right_tracks
        on left_tracks.track_id = right_tracks.track_id
        -- Strict inequality keeps each unordered pair once and drops self-pairs.
        and left_tracks.playlist_id < right_tracks.playlist_id
    group by 1, 2
),

playlist_sizes as (
    -- Distinct-track size of every playlist.
    select
        playlist_id,
        count(distinct track_id) as track_count
    from {{ ref('yamusic_fact_playlist_tracks') }}
    group by 1
)

select
    pairs.playlist_a_id,
    playlist_a.playlist_title as playlist_a_title,
    pairs.playlist_b_id,
    playlist_b.playlist_title as playlist_b_title,
    pairs.overlap_track_count,
    -- Union size by inclusion-exclusion: |A| + |B| - |A and B|.
    round(
        pairs.overlap_track_count * 1.0
        / nullif(size_a.track_count + size_b.track_count - pairs.overlap_track_count, 0),
        3
    ) as jaccard_overlap
from pairs
left join {{ ref('yamusic_dim_playlists') }} as playlist_a
    on pairs.playlist_a_id = playlist_a.playlist_id
left join {{ ref('yamusic_dim_playlists') }} as playlist_b
    on pairs.playlist_b_id = playlist_b.playlist_id
left join playlist_sizes as size_a
    on pairs.playlist_a_id = size_a.playlist_id
left join playlist_sizes as size_b
    on pairs.playlist_b_id = size_b.playlist_id
-- Track signals mart: one row per track in the track dimension, enriched with
-- playlist presence, library-event activity, a combined repeat score and a
-- 0/1 "underrated" flag.

with playlist_stats as (
    select
        track_id,
        count(*) as playlist_slots,                     -- membership rows
        count(distinct playlist_id) as playlist_count   -- distinct playlists
    from {{ ref('yamusic_fact_playlist_tracks') }}
    group by track_id
),

event_stats as (
    select
        track_id,
        count(*) as event_count,
        min(event_ts) as first_event_ts,
        max(event_ts) as last_event_ts
    from {{ ref('yamusic_fact_library_events') }}
    where track_id is not null
    group by track_id
)

select
    tracks.track_id,
    tracks.title,
    tracks.artist_display,
    tracks.album_title,
    tracks.genre,
    tracks.release_year,
    tracks.liked,
    coalesce(playlist_stats.playlist_slots, 0) as playlist_slots,
    coalesce(playlist_stats.playlist_count, 0) as playlist_count,
    coalesce(event_stats.event_count, 0) as event_count,
    event_stats.first_event_ts,
    event_stats.last_event_ts,
    -- Repeat score = playlist slots + library events; missing sides count as 0.
    coalesce(playlist_stats.playlist_slots, 0) + coalesce(event_stats.event_count, 0) as repeat_signal,
    -- Underrated = liked, yet present in at most one playlist.
    case
        when tracks.liked and coalesce(playlist_stats.playlist_count, 0) <= 1 then 1
        else 0
    end as underrated_flag
from {{ ref('yamusic_dim_tracks') }} as tracks
left join playlist_stats
    on tracks.track_id = playlist_stats.track_id
left join event_stats
    on tracks.track_id = event_stats.track_id
+ columns: + - name: track_id + tests: [not_null, unique] + - name: title + tests: [not_null] + + - name: stg_yamusic_artists + description: Normalized artist metadata from Yandex Music tracks. + columns: + - name: artist_id + tests: [not_null, unique] + - name: artist_name + tests: [not_null] + + - name: stg_yamusic_playlists + description: User playlists from Yandex Music. + columns: + - name: playlist_id + tests: [not_null, unique] + + - name: stg_yamusic_playlist_tracks + description: Track membership in Yandex Music playlists. + columns: + - name: playlist_id + tests: + - not_null + - relationships: + arguments: + to: ref('stg_yamusic_playlists') + field: playlist_id + - name: track_id + tests: + - not_null + - relationships: + arguments: + to: ref('stg_yamusic_tracks') + field: track_id + + - name: stg_yamusic_user_library_events + description: Synthetic library events derived from liked tracks and playlist membership. + columns: + - name: event_id + tests: [not_null, unique] + - name: event_type + tests: + - not_null + - accepted_values: + arguments: + values: ['liked_track', 'playlist_membership'] + + - name: stg_yamusic_manifest + description: Latest raw ingestion manifest with source type, generation time and per-dataset row counts. + columns: + - name: manifest_source + tests: + - not_null + - accepted_values: + arguments: + values: ['sample', 'yandex_music'] + - name: manifest_generated_at + tests: [not_null] + - name: adapter_name + tests: [not_null] + - name: adapter_version + tests: [not_null] + - name: client_library + tests: [not_null] + - name: diagnostic_liked_shortcuts_seen + tests: [not_null] + - name: diagnostic_playlist_tracks_seen + tests: [not_null] + + - name: yamusic_dim_tracks + description: Track dimension for local music self-analytics. + columns: + - name: track_id + tests: [not_null, unique] + - name: genre + description: Optional Yandex Music genre when available; null means the source did not expose genre metadata. 
+ + - name: yamusic_dim_artists + description: Artist dimension for local music self-analytics. + columns: + - name: artist_id + tests: [not_null, unique] + + - name: yamusic_dim_playlists + description: Playlist dimension with declared and observed track counts. + columns: + - name: playlist_id + tests: [not_null, unique] + + - name: yamusic_fact_playlist_tracks + description: Playlist-track bridge fact table. + + - name: yamusic_fact_library_events + description: User-library events derived from the available Yandex Music metadata. + columns: + - name: event_id + tests: [not_null, unique] + + - name: yamusic_artist_affinity + description: Artist-level concentration and playlist affinity metrics. + + - name: yamusic_track_signals + description: Track-level repeat and underrated signals for self-analytics. + columns: + - name: track_id + tests: + - not_null + - unique + - name: underrated_flag + tests: + - accepted_values: + arguments: + values: [0, 1] + + - name: yamusic_period_activity + description: Month-level activity timeline derived from available library events. + columns: + - name: activity_month + tests: + - not_null + - unique + + - name: yamusic_genre_profile + description: Genre distribution and diversity profile when genre metadata is available. + columns: + - name: genre + tests: + - not_null + - unique + + - name: yamusic_genre_periods + description: Month-by-genre activity distribution for spotting genre shifts over time. + tests: + - dbt_utils.unique_combination_of_columns: + arguments: + combination_of_columns: [activity_month, genre] + columns: + - name: activity_month + tests: + - not_null + - name: genre + tests: + - not_null + - name: event_share_in_month + tests: + - not_null + + - name: yamusic_playlist_overlap + description: Pairwise playlist overlap with Jaccard similarity. + + - name: yamusic_playlist_signals + description: Playlist-level uniqueness, overlap and underrated playlist signals. 
+ columns: + - name: playlist_id + tests: + - not_null + - unique + - name: underrated_playlist_flag + tests: + - accepted_values: + arguments: + values: [0, 1] + + - name: yamusic_library_profile + description: One-row self-analytics profile for the local library snapshot. + columns: + - name: manifest_source + description: Source of the latest raw ingestion run, either deterministic sample data or real Yandex Music metadata. + tests: + - not_null + - accepted_values: + arguments: + values: ['sample', 'yandex_music'] + - name: adapter_name + description: Ingestion adapter that wrote the latest raw manifest. + tests: [not_null] + - name: adapter_version + description: Local ingestion adapter version from the `yamusic_ingest` package. + tests: [not_null] + - name: diagnostic_liked_shortcuts_fetch_failed + description: Count of liked-track shortcuts that failed hydration after retries; count only, no IDs or titles. + tests: [not_null] + - name: diagnostic_liked_tracks_duplicate_skipped + description: Count of duplicate liked-track shortcuts skipped after the first stable track ID was written. + tests: [not_null] + - name: diagnostic_liked_albums_seen + description: Count of liked album metadata objects returned by Yandex Music. + tests: [not_null] + - name: diagnostic_liked_albums_written + description: Count of liked album metadata objects written into the local album dimension feed. + tests: [not_null] + - name: diagnostic_liked_artists_seen + description: Count of liked artist metadata objects returned by Yandex Music. + tests: [not_null] + - name: diagnostic_liked_artists_written + description: Count of liked artist metadata objects written into the local artist dimension feed. + tests: [not_null] + - name: diagnostic_liked_playlists_seen + description: Count of liked playlist metadata objects returned by Yandex Music. 
+ tests: [not_null] + - name: diagnostic_liked_playlists_written + description: Count of liked playlist metadata objects written into the local playlist dimension feed. + tests: [not_null] + - name: diagnostic_playlist_tracks_missing_track_id + description: Count of playlist track entries skipped because no stable track ID was exposed. + tests: [not_null] + - name: diagnostic_playlist_tracks_fetch_failed + description: Count of playlist track shortcuts that failed full-track hydration after retries; rows may still be written from shortcut metadata. + tests: [not_null] + - name: diagnostic_playlist_tracks_duplicate_skipped + description: Count of duplicate playlist-track memberships skipped after the first playlist-track pair was written. + tests: [not_null] + - name: raw_tracks_sha256 + description: SHA256 checksum of the raw tracks JSONL file recorded in the ingestion manifest. + tests: [not_null] + - name: raw_playlist_tracks_sha256 + description: SHA256 checksum of the raw playlist-track JSONL file recorded in the ingestion manifest. + tests: [not_null] + - name: raw_user_library_events_sha256 + description: SHA256 checksum of the raw user-library-events JSONL file recorded in the ingestion manifest. + tests: [not_null] + - name: stale_ingestion_flag + description: 1 when no ingestion timestamp is present or the newest local library event is older than 168 hours. 
+ tests: + - accepted_values: + arguments: + values: [0, 1] diff --git a/dbt/models/yamusic/staging/stg_yamusic_albums.sql b/dbt/models/yamusic/staging/stg_yamusic_albums.sql new file mode 100644 index 0000000..3f08176 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_albums.sql @@ -0,0 +1,34 @@ +with source as ( + select * + from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/albums.jsonl', + columns={ + album_id: 'VARCHAR', + album_title: 'VARCHAR', + genre: 'VARCHAR', + release_year: 'INTEGER', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } + ) +), + +deduped as ( + select + cast(album_id as varchar) as album_id, + nullif(album_title, '') as album_title, + nullif(genre, '') as genre, + cast(release_year as integer) as release_year, + source, + cast(ingested_at as timestamp) as ingested_at, + row_number() over ( + partition by cast(album_id as varchar) + order by cast(ingested_at as timestamp) desc + ) as row_num + from source + where album_id is not null +) + +select * exclude (row_num) +from deduped +where row_num = 1 diff --git a/dbt/models/yamusic/staging/stg_yamusic_artists.sql b/dbt/models/yamusic/staging/stg_yamusic_artists.sql new file mode 100644 index 0000000..744b961 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_artists.sql @@ -0,0 +1,30 @@ +with source as ( + select * + from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/artists.jsonl', + columns={ + artist_id: 'VARCHAR', + artist_name: 'VARCHAR', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } + ) +), + +deduped as ( + select + cast(artist_id as varchar) as artist_id, + nullif(artist_name, '') as artist_name, + source, + cast(ingested_at as timestamp) as ingested_at, + row_number() over ( + partition by cast(artist_id as varchar) + order by cast(ingested_at as timestamp) desc + ) as row_num + from source + where artist_id is not null +) + +select * exclude (row_num) +from deduped +where row_num = 1 diff --git 
a/dbt/models/yamusic/staging/stg_yamusic_manifest.sql b/dbt/models/yamusic/staging/stg_yamusic_manifest.sql new file mode 100644 index 0000000..f0773a3 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_manifest.sql @@ -0,0 +1,53 @@ +with source as ( + select * + from read_json_auto('../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/_manifest.json') +) + +select + cast(source as varchar) as manifest_source, + cast(generated_at as timestamp) as manifest_generated_at, + cast(raw_dir as varchar) as manifest_raw_dir, + coalesce(cast(json_only as boolean), false) as manifest_json_only, + cast(adapter.adapter_name as varchar) as adapter_name, + cast(adapter.adapter_version as varchar) as adapter_version, + cast(adapter.client_library as varchar) as client_library, + cast(adapter.client_library_version as varchar) as client_library_version, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_shortcuts_seen') as bigint), 0) as diagnostic_liked_shortcuts_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_tracks_written') as bigint), 0) as diagnostic_liked_tracks_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_shortcuts_fetch_failed') as bigint), 0) as diagnostic_liked_shortcuts_fetch_failed, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_shortcuts_missing_track_id') as bigint), 0) as diagnostic_liked_shortcuts_missing_track_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_tracks_duplicate_skipped') as bigint), 0) as diagnostic_liked_tracks_duplicate_skipped, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_albums_seen') as bigint), 0) as diagnostic_liked_albums_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_albums_written') as bigint), 0) as diagnostic_liked_albums_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_albums_missing_id') as bigint), 0) as 
diagnostic_liked_albums_missing_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_albums_duplicate_skipped') as bigint), 0) as diagnostic_liked_albums_duplicate_skipped, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_artists_seen') as bigint), 0) as diagnostic_liked_artists_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_artists_written') as bigint), 0) as diagnostic_liked_artists_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_artists_missing_id') as bigint), 0) as diagnostic_liked_artists_missing_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_artists_duplicate_skipped') as bigint), 0) as diagnostic_liked_artists_duplicate_skipped, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_playlists_seen') as bigint), 0) as diagnostic_liked_playlists_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_playlists_written') as bigint), 0) as diagnostic_liked_playlists_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_playlists_missing_id') as bigint), 0) as diagnostic_liked_playlists_missing_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.liked_playlists_duplicate_skipped') as bigint), 0) as diagnostic_liked_playlists_duplicate_skipped, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlists_seen') as bigint), 0) as diagnostic_playlists_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlists_written') as bigint), 0) as diagnostic_playlists_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlists_missing_id') as bigint), 0) as diagnostic_playlists_missing_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_fetch_fallbacks') as bigint), 0) as diagnostic_playlist_fetch_fallbacks, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_tracks_seen') as bigint), 0) as 
diagnostic_playlist_tracks_seen, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_tracks_written') as bigint), 0) as diagnostic_playlist_tracks_written, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_tracks_fetch_failed') as bigint), 0) as diagnostic_playlist_tracks_fetch_failed, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_tracks_missing_track_id') as bigint), 0) as diagnostic_playlist_tracks_missing_track_id, + coalesce(cast(json_extract_string(to_json(diagnostics), '$.playlist_tracks_duplicate_skipped') as bigint), 0) as diagnostic_playlist_tracks_duplicate_skipped, + coalesce(cast(datasets.tracks.row_count as bigint), 0) as raw_tracks, + cast(datasets.tracks.jsonl_sha256 as varchar) as raw_tracks_sha256, + coalesce(cast(datasets.artists.row_count as bigint), 0) as raw_artists, + cast(datasets.artists.jsonl_sha256 as varchar) as raw_artists_sha256, + coalesce(cast(datasets.albums.row_count as bigint), 0) as raw_albums, + cast(datasets.albums.jsonl_sha256 as varchar) as raw_albums_sha256, + coalesce(cast(datasets.playlists.row_count as bigint), 0) as raw_playlists, + cast(datasets.playlists.jsonl_sha256 as varchar) as raw_playlists_sha256, + coalesce(cast(datasets.playlist_tracks.row_count as bigint), 0) as raw_playlist_tracks, + cast(datasets.playlist_tracks.jsonl_sha256 as varchar) as raw_playlist_tracks_sha256, + coalesce(cast(datasets.user_library_events.row_count as bigint), 0) as raw_user_library_events, + cast(datasets.user_library_events.jsonl_sha256 as varchar) as raw_user_library_events_sha256 +from source diff --git a/dbt/models/yamusic/staging/stg_yamusic_playlist_tracks.sql b/dbt/models/yamusic/staging/stg_yamusic_playlist_tracks.sql new file mode 100644 index 0000000..0e86cf3 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_playlist_tracks.sql @@ -0,0 +1,19 @@ +select distinct + cast(playlist_id as varchar) as playlist_id, + cast(track_id as varchar) as track_id, + 
cast(position as integer) as position, + source, + cast(ingested_at as timestamp) as ingested_at +from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/playlist_tracks.jsonl', + columns={ + playlist_id: 'VARCHAR', + track_id: 'VARCHAR', + position: 'INTEGER', + added_at: 'TIMESTAMP', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } +) +where playlist_id is not null + and track_id is not null diff --git a/dbt/models/yamusic/staging/stg_yamusic_playlists.sql b/dbt/models/yamusic/staging/stg_yamusic_playlists.sql new file mode 100644 index 0000000..1727ca7 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_playlists.sql @@ -0,0 +1,32 @@ +with source as ( + select * + from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/playlists.jsonl', + columns={ + playlist_id: 'VARCHAR', + playlist_title: 'VARCHAR', + track_count: 'INTEGER', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } + ) +), + +deduped as ( + select + cast(playlist_id as varchar) as playlist_id, + nullif(playlist_title, '') as playlist_title, + cast(track_count as integer) as declared_track_count, + source, + cast(ingested_at as timestamp) as ingested_at, + row_number() over ( + partition by cast(playlist_id as varchar) + order by cast(ingested_at as timestamp) desc + ) as row_num + from source + where playlist_id is not null +) + +select * exclude (row_num) +from deduped +where row_num = 1 diff --git a/dbt/models/yamusic/staging/stg_yamusic_tracks.sql b/dbt/models/yamusic/staging/stg_yamusic_tracks.sql new file mode 100644 index 0000000..63cff0c --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_tracks.sql @@ -0,0 +1,48 @@ +with source as ( + select * + from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/tracks.jsonl', + columns={ + track_id: 'VARCHAR', + title: 'VARCHAR', + duration_ms: 'BIGINT', + album_id: 'VARCHAR', + album_title: 'VARCHAR', + genre: 'VARCHAR', + release_year: 'INTEGER', + label: 'VARCHAR', + 
artist_ids: 'VARCHAR[]', + artist_names: 'VARCHAR[]', + liked: 'BOOLEAN', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } + ) +), + +deduped as ( + select + cast(track_id as varchar) as track_id, + nullif(title, '') as title, + cast(duration_ms as bigint) as duration_ms, + cast(album_id as varchar) as album_id, + album_title, + nullif(genre, '') as genre, + cast(release_year as integer) as release_year, + nullif(label, '') as label, + artist_ids, + artist_names, + coalesce(cast(liked as boolean), false) as liked, + source, + cast(ingested_at as timestamp) as ingested_at, + row_number() over ( + partition by cast(track_id as varchar) + order by cast(ingested_at as timestamp) desc + ) as row_num + from source + where track_id is not null +) + +select * exclude (row_num) +from deduped +where row_num = 1 diff --git a/dbt/models/yamusic/staging/stg_yamusic_user_library_events.sql b/dbt/models/yamusic/staging/stg_yamusic_user_library_events.sql new file mode 100644 index 0000000..cbba4b4 --- /dev/null +++ b/dbt/models/yamusic/staging/stg_yamusic_user_library_events.sql @@ -0,0 +1,21 @@ +select distinct + cast(event_id as varchar) as event_id, + cast(event_type as varchar) as event_type, + cast(track_id as varchar) as track_id, + cast(playlist_id as varchar) as playlist_id, + cast(event_ts as timestamp) as event_ts, + source, + cast(ingested_at as timestamp) as ingested_at +from read_json( + '../{{ env_var("STREAMIFY_RAW_DIR", "data/raw/yamusic") }}/user_library_events.jsonl', + columns={ + event_id: 'VARCHAR', + event_type: 'VARCHAR', + track_id: 'VARCHAR', + playlist_id: 'VARCHAR', + event_ts: 'TIMESTAMP', + source: 'VARCHAR', + ingested_at: 'TIMESTAMP' + } +) +where event_id is not null diff --git a/dbt/models/yamusic/yamusic_genre_periods.sql b/dbt/models/yamusic/yamusic_genre_periods.sql new file mode 100644 index 0000000..972c9bf --- /dev/null +++ b/dbt/models/yamusic/yamusic_genre_periods.sql @@ -0,0 +1,33 @@ +with event_tracks as ( + select + 
date_trunc('month', events.event_ts)::date as activity_month, + coalesce(tracks.genre, 'unknown') as genre, + events.event_type, + events.track_id + from {{ ref('yamusic_fact_library_events') }} as events + left join {{ ref('yamusic_dim_tracks') }} as tracks + on events.track_id = tracks.track_id + where events.event_ts is not null +), + +monthly_genres as ( + select + activity_month, + genre, + count(*) as event_count, + count(distinct track_id) as active_tracks, + sum(case when event_type = 'liked_track' then 1 else 0 end) as liked_events, + sum(case when event_type = 'playlist_membership' then 1 else 0 end) as playlist_events + from event_tracks + group by 1, 2 +) + +select + activity_month, + genre, + event_count, + active_tracks, + liked_events, + playlist_events, + round(event_count * 1.0 / nullif(sum(event_count) over (partition by activity_month), 0), 3) as event_share_in_month +from monthly_genres diff --git a/dbt/profiles.yml b/dbt/profiles.yml index ddab0f7..90354b4 100644 --- a/dbt/profiles.yml +++ b/dbt/profiles.yml @@ -22,4 +22,8 @@ streamify: threads: 4 timeout_seconds: 300 type: bigquery - target: dev \ No newline at end of file + local: + type: duckdb + path: "../{{ env_var('STREAMIFY_DUCKDB_PATH', 'data/streamify.duckdb') }}" + threads: "{{ env_var('DBT_THREADS', 1) | int }}" + target: dev diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 0000000..1c20894 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,77 @@ +services: + streamify-local: + profiles: ["local"] + build: + context: . 
+ dockerfile: local/Dockerfile + environment: + YANDEX_MUSIC_TOKEN: ${YANDEX_MUSIC_TOKEN:-} + STREAMIFY_DATA_DIR: ${STREAMIFY_DATA_DIR:-data} + STREAMIFY_RAW_DIR: ${STREAMIFY_RAW_DIR:-data/raw/yamusic} + STREAMIFY_DUCKDB_PATH: ${STREAMIFY_DUCKDB_PATH:-data/streamify.duckdb} + STREAMIFY_REPORT_PATH: ${STREAMIFY_REPORT_PATH:-data/streamify_summary.md} + STREAMIFY_SNAPSHOT_PATH: ${STREAMIFY_SNAPSHOT_PATH:-data/streamify_snapshot.json} + STREAMIFY_RECOMMENDATIONS_DIR: ${STREAMIFY_RECOMMENDATIONS_DIR:-data/recommendations} + DBT_THREADS: ${DBT_THREADS:-1} + volumes: + - ./data:/app/data + - ./dbt:/app/dbt + command: + - bash + - -lc + - | + set -euo pipefail + if [ -n "$$YANDEX_MUSIC_TOKEN" ]; then + python -m yamusic_ingest + READINESS_ARGS="--require-real" + else + python -m yamusic_ingest --sample + READINESS_ARGS="" + fi + python scripts/validate_yamusic_raw_contract.py + cd dbt + GCP_PROJECT_ID=dummy dbt deps + GCP_PROJECT_ID=dummy dbt build --profiles-dir . --target local --select yamusic + cd .. + python scripts/doctor_yamusic_local.py + python scripts/export_yamusic_summary.py + python scripts/export_yamusic_snapshot.py + python scripts/export_yamusic_recommendations.py + python scripts/audit_yamusic_readiness.py $$READINESS_ARGS + + dashboard: + profiles: ["local"] + build: + context: . 
+ dockerfile: local/Dockerfile + depends_on: + streamify-local: + condition: service_completed_successfully + environment: + STREAMIFY_DUCKDB_PATH: ${STREAMIFY_DUCKDB_PATH:-data/streamify.duckdb} + volumes: + - ./data:/app/data + - ./dashboard:/app/dashboard + ports: + - "${STREAMIFY_DASHBOARD_PORT:-8501}:8501" + command: ["streamlit", "run", "dashboard/app.py", "--server.address=0.0.0.0", "--server.port=8501"] + + kafka: + image: confluentinc/cp-kafka:7.7.1 + profiles: ["kafka"] + ports: + - "9092:9092" + environment: + KAFKA_NODE_ID: 1 + KAFKA_PROCESS_ROLES: broker,controller + KAFKA_CONTROLLER_QUORUM_VOTERS: 1@kafka:9093 + KAFKA_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://0.0.0.0:9092,CONTROLLER://kafka:9093 + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT,CONTROLLER:PLAINTEXT + KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER + KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + CLUSTER_ID: MkU3OEVBNTcwNTJENDM2Qk diff --git a/docs/product_acceptance.md b/docs/product_acceptance.md new file mode 100644 index 0000000..9c7d881 --- /dev/null +++ b/docs/product_acceptance.md @@ -0,0 +1,61 @@ +# Streamify Local Product Acceptance + +This document maps the MVP requirements to concrete repository artifacts and verification commands. The local product is considered ready for sample metadata after `make test` passes. It is considered ready for a real account only after `make acceptance-real` passes with a valid `YANDEX_MUSIC_TOKEN`. 
+ +## Requirement Matrix + +| Requirement | Implementation Evidence | Verification | +| --- | --- | --- | +| Fully local run without GCP | `Makefile`, `docker-compose.local.yml`, `dbt/profiles.yml` local DuckDB target | `make acceptance-local`, `make compose-smoke-local` | +| Local operator entrypoint | `make help` lists sample, real-account, Docker, export, readiness and cleanup commands | `make help` | +| Docker Compose local product path | `docker-compose.local.yml` `local` profile with one-shot build, dashboard services, `set -euo pipefail`, real-source readiness enforcement when a token is configured, and compose smoke validation of raw/product/dashboard artifacts | `make up-local`, `make compose-smoke-local` | +| Yandex Music metadata ingestion | `yamusic_ingest/__main__.py`, `yamusic_ingest/yandex_client.py`; liked tracks, owned playlists, liked playlists, liked albums and liked artists where exposed by the API | `make preflight`, `make ingest`, `make acceptance-real` | +| No audio download or storage | metadata-only adapter, `.gitignore`, safety scripts | `scripts/check_no_audio_artifacts.py`, `scripts/check_no_local_sensitive_artifacts.py` | +| Raw normalized outputs | `tracks`, `artists`, `albums`, `playlists`, `playlist_tracks`, `user_library_events` JSONL/Parquet writers | `make ingest-sample`, `make raw-contract` | +| Bronze/silver/gold data engineering path | `data/raw/yamusic`, `stg_yamusic_*`, `yamusic_dim_*`, `yamusic_fact_*`, profile/signal marts | `make dbt-build`, `make doctor` | +| Source provenance and stale-build protection | `stg_yamusic_manifest`, manifest fields in `yamusic_library_profile`, doctor/readiness raw-count alignment checks | `make doctor`, `make readiness`, `make product-answers-smoke` | +| Idempotent local ingestion | overwrite-per-run raw writer, stale Parquet cleanup, and `_manifest.json` row counts | repeated `make ingest-sample`, `make raw-contract` | +| Data quality checks | dbt schema tests, raw contract, doctor, safety 
checks, empty-account smoke | `make test` | +| Practical self-analytics answers | `yamusic_artist_affinity`, `yamusic_genre_periods`, `yamusic_track_signals`, `yamusic_playlist_signals`, `yamusic_library_profile`, dashboard genre/liked/search filters, dashboard Actions/Data Quality tabs, dashboard content smoke, `data/streamify_summary.md`, `data/streamify_snapshot.json`, `data/recommendations/*.csv` | `make product-answers-smoke`, `make dashboard-smoke`, `make report`, `make snapshot`, `make recommendations`, `make dashboard` | +| Empty/private account handling | typed empty raw files and empty dbt smoke | `scripts/smoke_empty_yamusic_dbt.py`, `make test` | +| Token safety | `.env.example`, `.gitignore`, no token in manifest/report/status, preflight without raw writes | `make status`, `make preflight`, `scripts/check_no_local_sensitive_artifacts.py` | + +## Current Acceptance Status + +Sample metadata path: + +- `make acceptance-local` proves local ingestion, raw contract, DuckDB/dbt marts, doctor, report, readiness and dashboard startup. +- `make test` proves static contracts, safety guards, empty-account handling, sample acceptance, Python unit tests, Compose config and Compose local smoke. + +Real account path: + +- A valid `YANDEX_MUSIC_TOKEN` in `.env` is still required to prove real-account ingestion. +- The final real-account commands are: + +```bash +make acceptance-real +make dashboard +``` + +The readiness audit must report `"real_account_verified": true` before the real-account MVP is considered proven. `make acceptance-real` enforces this through `make readiness-real`, which runs `scripts/audit_yamusic_readiness.py --require-real`. + +## Product Answers Covered + +| User question | Primary artifact | +| --- | --- | +| Who are my strongest artists? | `yamusic_artist_affinity`, dashboard Artists tab | +| Which tracks repeat across library contexts? | `yamusic_track_signals.repeat_signal`, dashboard Tracks tab | +| How has genre composition shifted?
| `yamusic_genre_periods`, dashboard Genres/Periods tabs | +| How diverse or concentrated is my library? | `yamusic_library_profile`, `yamusic_genre_profile` | +| Which playlists overlap? | `yamusic_playlist_overlap`, dashboard Playlists tab | +| Which tracks or playlists look underrated? | `yamusic_track_signals`, `yamusic_playlist_signals`, static report | +| What should I do next? | dashboard Actions tab, JSON snapshot `next_actions` | +| Can I open action queues in a spreadsheet? | `data/recommendations/*.csv`, `make recommendations` | +| Can I reuse the answers outside the dashboard? | `data/streamify_snapshot.json`, `make snapshot` | +| Is my local data trustworthy? | dashboard Data Quality tab, JSON snapshot quality block, `make doctor`, `make readiness` | + +## Known Product Limits + +- Yandex Music availability depends on the unofficial `yandex-music` package and account-visible metadata. +- Listening timestamps/history are used only when exposed by the account/API response; otherwise the product falls back to liked-track and playlist-membership events. +- The dashboard and report are analytics over metadata and derived events, not audio playback or audio feature extraction. diff --git a/docs/project_management.md b/docs/project_management.md new file mode 100644 index 0000000..feb2613 --- /dev/null +++ b/docs/project_management.md @@ -0,0 +1,27 @@ +# Streamify Project Management + +Streamify work is tracked as agent-sized GitHub issues. Each issue should define the product outcome, the owning lane, and concrete acceptance evidence. 
+ +## Agent Lanes + +| Lane | Scope | Default proof | +| --- | --- | --- | +| Repo/Build | Local setup, Docker Compose, Makefile, CI, release automation | `make test`, `make compose-smoke-local` | +| Yandex Ingestion | Token-safe metadata ingestion, raw contracts, API edge cases | `make preflight`, `make ingest`, `make raw-contract` | +| Analytics/dbt | DuckDB targets, typed staging, marts, tests, lineage | `make dbt-build`, `make doctor` | +| Product/Dashboard | Streamlit UX, exports, product docs, answer quality | `make dashboard-smoke`, `make product-answers-smoke` | +| QA/Integration | End-to-end acceptance, privacy gates, release readiness | `make acceptance-local`, `make acceptance-real` | + +## Working Rules + +- Do not paste tokens, raw account data, DuckDB files, or screenshots with private listening data into issues. +- Use sample metadata for CI, GitHub Pages, and public release artifacts. +- Use `make acceptance-real` only on a trusted local machine with `.env`. +- Every PR should update tests or explain why the existing gates prove the change. +- Release candidates should link the GitHub issue set they close and include known API limitations. + +## Suggested Milestones + +1. `v0.1.0-local-mvp`: real-account metadata ingestion, DuckDB marts, dashboard, safety gates. +2. `v0.2.0-product-answers`: richer action queues, more dashboard filters, better empty/error states. +3. `v0.3.0-official-import`: optional official Yandex data archive importer if usable account export data is available. diff --git a/docs/release_process.md b/docs/release_process.md new file mode 100644 index 0000000..9ae023d --- /dev/null +++ b/docs/release_process.md @@ -0,0 +1,23 @@ +# Release Process + +Streamify releases are source-first and privacy-safe. Public artifacts must never include `.env`, real raw Yandex Music exports, DuckDB files, account snapshots, recommendation CSVs from a real account, or audio. + +## Release Checklist + +1. 
Open or update GitHub issues for the agent lanes included in the release. +2. Run `make test` locally. +3. If ingestion changed, run `make acceptance-real` locally and keep the generated data untracked. +4. Confirm `git status --ignored .env data` shows `.env` and generated data as ignored. +5. Update `docs/releases/vX.Y.Z.md`. +6. Tag the release: + +```bash +git tag vX.Y.Z +git push origin vX.Y.Z +``` + +The `Release` workflow runs sample-data validation, builds public docs, packages tracked source via `git archive`, and creates a GitHub release from the tag notes. + +## GitHub Pages + +The `GitHub Pages` workflow builds a static product site from README/docs plus sample metadata report artifacts. It intentionally clears `YANDEX_MUSIC_TOKEN` so public pages are reproducible and do not depend on a private account. diff --git a/docs/releases/v0.1.0.md b/docs/releases/v0.1.0.md new file mode 100644 index 0000000..edf3250 --- /dev/null +++ b/docs/releases/v0.1.0.md @@ -0,0 +1,21 @@ +# Streamify v0.1.0 Local MVP + +## Product Value + +Streamify turns a Yandex Music account into a local, reproducible self-analytics lakehouse. It answers practical questions about favorite artists and tracks, genre shifts, playlist overlap, repeated patterns, diversity, active periods, underrated tracks and playlists, data quality, and missing metadata. + +## Highlights + +- Metadata-only Yandex Music ingestion with token-safe `.env` handling. +- Local raw JSONL/optional Parquet outputs with manifest row counts and SHA256 checksums. +- DuckDB/dbt local target with typed staging, marts, relationship tests, and freshness/readiness checks. +- Streamlit dashboard for overview, periods, artists, genres, playlists, tracks, action queues, and data quality. +- Static summary, JSON snapshot, and CSV recommendation exports. +- Docker Compose local profile and sample-data CI path. +- Privacy gates preventing `.env`, raw account data, DuckDB files, recommendations, and audio from being tracked. 
+ +## Known Limitations + +- Live Yandex Music ingestion uses an unofficial client library and can break if Yandex changes private endpoints. +- Full listening history is only available where the client/API exposes usable timestamps; otherwise analytics are based on library, likes, playlists, and metadata. +- Public CI, Pages, and release artifacts use sample metadata only. diff --git a/docs/yamusic_lineage.md b/docs/yamusic_lineage.md new file mode 100644 index 0000000..399df23 --- /dev/null +++ b/docs/yamusic_lineage.md @@ -0,0 +1,134 @@ +# Yandex Music Local Lineage + +This catalog documents the local metadata-only data path. Streamify does not download, store, transform, or play audio. + +## Layer Map + +| Layer | Artifact | Purpose | +| --- | --- | --- | +| Raw/Bronze | `data/raw/yamusic/tracks.jsonl` | Track metadata, album fields, artist arrays, liked flag, source and ingestion timestamp. | +| Raw/Bronze | `data/raw/yamusic/artists.jsonl` | Artist metadata discovered from tracks and account-visible liked artists. | +| Raw/Bronze | `data/raw/yamusic/albums.jsonl` | Album metadata discovered from tracks and account-visible liked albums. | +| Raw/Bronze | `data/raw/yamusic/playlists.jsonl` | Owned playlist metadata plus account-visible liked playlists and declared track counts where exposed by Yandex Music. | +| Raw/Bronze | `data/raw/yamusic/playlist_tracks.jsonl` | Playlist-track membership and positions. | +| Raw/Bronze | `data/raw/yamusic/user_library_events.jsonl` | Derived metadata events for liked tracks and playlist membership. | +| Raw/Bronze | `data/raw/yamusic/_manifest.json` | Source, generated timestamp, adapter/client metadata, diagnostics counters, output paths, row counts and JSONL checksums. It must not contain token material. | +| Silver | `stg_yamusic_manifest` | Parsed ingestion manifest with source, generated timestamp, JSON-only flag, adapter/client metadata, diagnostics counters and raw row counts. 
| +| Silver | `stg_yamusic_*` | Typed DuckDB reads, dedupe, null normalization and relationship-ready keys. | +| Gold | `yamusic_dim_*`, `yamusic_fact_*`, `yamusic_*_profile`, `yamusic_*_signals` | Practical marts for self-analytics and dashboard views. | +| App | `dashboard/app.py` | Streamlit interface over `data/streamify.duckdb`. | +| Report | `data/streamify_summary.md` | Static answer-first self-analytics summary exported from the same DuckDB marts. | +| Snapshot | `data/streamify_snapshot.json` | Schema-versioned JSON self-analytics snapshot for automation, CI artifacts and downstream agent workflows. | +| Recommendations | `data/recommendations/*.csv` | Spreadsheet-friendly action queues for rediscovery, playlist cleanup, standout playlists, top artists and genre shifts. | + +## Lineage + +```mermaid +flowchart LR + ingest["yamusic_ingest\nmetadata only"] --> raw_tracks["tracks.jsonl"] + ingest --> raw_artists["artists.jsonl"] + ingest --> raw_albums["albums.jsonl"] + ingest --> raw_playlists["playlists.jsonl"] + ingest --> raw_playlist_tracks["playlist_tracks.jsonl"] + ingest --> raw_events["user_library_events.jsonl"] + ingest --> raw_manifest["_manifest.json"] + + raw_tracks --> stg_tracks["stg_yamusic_tracks"] + raw_artists --> stg_artists["stg_yamusic_artists"] + raw_albums --> stg_albums["stg_yamusic_albums"] + raw_playlists --> stg_playlists["stg_yamusic_playlists"] + raw_playlist_tracks --> stg_playlist_tracks["stg_yamusic_playlist_tracks"] + raw_events --> stg_events["stg_yamusic_user_library_events"] + + stg_tracks --> dim_tracks["yamusic_dim_tracks"] + stg_artists --> dim_artists["yamusic_dim_artists"] + stg_albums --> dim_albums["yamusic_dim_albums"] + stg_playlists --> dim_playlists["yamusic_dim_playlists"] + stg_playlist_tracks --> fact_playlist_tracks["yamusic_fact_playlist_tracks"] + stg_events --> fact_events["yamusic_fact_library_events"] + + dim_tracks --> artist_affinity["yamusic_artist_affinity"] + fact_playlist_tracks --> 
artist_affinity + dim_tracks --> genre_profile["yamusic_genre_profile"] + dim_tracks --> track_signals["yamusic_track_signals"] + fact_playlist_tracks --> track_signals + fact_events --> track_signals + fact_events --> period_activity["yamusic_period_activity"] + dim_tracks --> period_activity + fact_events --> genre_periods["yamusic_genre_periods"] + dim_tracks --> genre_periods + fact_playlist_tracks --> playlist_overlap["yamusic_playlist_overlap"] + dim_playlists --> playlist_overlap + dim_playlists --> playlist_signals["yamusic_playlist_signals"] + playlist_overlap --> playlist_signals + + artist_affinity --> library_profile["yamusic_library_profile"] + genre_profile --> library_profile + track_signals --> library_profile + period_activity --> library_profile + playlist_signals --> library_profile + dim_tracks --> library_profile + dim_playlists --> library_profile + fact_events --> library_profile + + library_profile --> dashboard["Streamlit dashboard"] + library_profile --> report["Markdown summary"] + library_profile --> snapshot["JSON snapshot"] + library_profile --> recommendations["Recommendation CSVs"] + dim_tracks --> dashboard + dim_tracks --> snapshot + artist_affinity --> dashboard + artist_affinity --> report + artist_affinity --> snapshot + artist_affinity --> recommendations + period_activity --> dashboard + period_activity --> snapshot + genre_periods --> dashboard + genre_periods --> report + genre_periods --> snapshot + genre_periods --> recommendations + genre_profile --> dashboard + genre_profile --> snapshot + playlist_overlap --> dashboard + playlist_overlap --> snapshot + playlist_overlap --> recommendations + playlist_signals --> dashboard + playlist_signals --> report + playlist_signals --> snapshot + playlist_signals --> recommendations + track_signals --> dashboard + track_signals --> report + track_signals --> snapshot + track_signals --> recommendations +``` + +## Product Questions + +| Product question | Primary model | Supporting 
models | +| --- | --- | --- | +| Favorite artists | `yamusic_artist_affinity` | `yamusic_dim_tracks`, `yamusic_fact_playlist_tracks` | +| Favorite tracks | `yamusic_dim_tracks` | `yamusic_track_signals` | +| Genre shifts | `yamusic_genre_periods` | `yamusic_period_activity`, `yamusic_genre_profile` | +| Repeats | `yamusic_track_signals.repeat_signal` | `yamusic_fact_library_events`, `yamusic_fact_playlist_tracks` | +| Diversity | `yamusic_genre_profile`, `yamusic_library_profile` | `yamusic_artist_affinity` | +| Active periods | `yamusic_period_activity` | `yamusic_fact_library_events` | +| Underrated tracks | `yamusic_track_signals.underrated_flag` | `yamusic_dim_tracks` | +| Underrated playlists | `yamusic_playlist_signals.underrated_playlist_flag` | `yamusic_playlist_overlap`, `yamusic_dim_playlists` | +| Data freshness | `yamusic_library_profile.stale_ingestion_flag` | `yamusic_fact_library_events` | + +## Quality Gates + +| Gate | Command | What it proves | +| --- | --- | --- | +| Raw contract | `make raw-contract` | Required raw JSONL fields, basic types, source values, event types, manifest row counts, JSONL sha256 checksums, ingestion diagnostics consistency, unique IDs and playlist/event referential integrity. | +| dbt build | `make dbt-build` | dbt packages resolve, then DuckDB marts compile, build and pass schema/relationship tests. | +| Local doctor | `make doctor` | Required raw files, manifest, mart tables and one-row profile exist, with DuckDB manifest source/raw counts matching the latest raw files. | +| Dashboard smoke | `make dashboard-smoke` | Streamlit starts against the local DuckDB file and returns HTTP 200. | +| Report export | `make report` | Static markdown summary and JSON snapshot can be generated from the local DuckDB marts. | +| Snapshot export | `make snapshot` | Schema-versioned JSON self-analytics snapshot can be generated independently for automation and agent workflows. 
| +| Recommendations export | `make recommendations` | Spreadsheet-friendly CSV queues can be generated independently for top artists, rediscovery tracks, playlist cleanup, standout playlists and genre shifts. | +| Product answers | `make product-answers-smoke` | Favorite artists/tracks, repeats, genre shifts, diversity, active periods, playlist overlap, underrated signals, source provenance and data-quality profile are queryable from marts/report/snapshot. | +| Readiness audit | `make readiness` | Current raw counts, DuckDB profile, report existence, no-audio invariant, sample-vs-real source status and stale-dbt protection are verified. | +| Compose smoke | `make compose-smoke-local` | Docker Compose `local` profile builds, ingests sample metadata, validates raw contract, builds marts, runs doctor, exports the report, runs readiness, and serves dashboard HTTP 200. | +| Full local gate | `make test` | Static validators, safety guards, empty-account smoke, sample acceptance, Python tests and Docker Compose smoke. | +| Real account gate | `make acceptance-real` | Real token preflight, real metadata ingestion, raw contract, dbt deps/build, doctor, report, `readiness-real` source enforcement and dashboard smoke. | diff --git a/docs/yandex_music_local.md b/docs/yandex_music_local.md new file mode 100644 index 0000000..049eda8 --- /dev/null +++ b/docs/yandex_music_local.md @@ -0,0 +1,184 @@ +# Local Yandex Music Analytics + +Streamify's local product track builds a personal music self-analytics lakehouse from Yandex Music metadata. It is designed to run on a laptop without GCP credentials or cloud spend. + +No audio is downloaded, stored, transformed, or played by this project. The ingestion adapter only reads metadata exposed to the account by the Yandex Music client library. 
+ +## Quick Start + +```bash +cp .env.example .env +make setup +make help +make status +make ingest-sample +make raw-contract +make dbt-build +make doctor +make report +make readiness +make dashboard-smoke +make dashboard +``` + +To use real account metadata, set `YANDEX_MUSIC_TOKEN` in `.env` and run: + +```bash +make acceptance-real +make dashboard +``` + +## Yandex Music Token + +Streamify does not ask for your Yandex password and does not fetch a token by itself. The installed `yandex-music` Python client accepts an existing OAuth token through `Client(token).init()`, but version 2.2.0 does not expose a `device_auth` helper. + +Run `make token-help` for the short in-repo reminder. + +Use an external Yandex Music OAuth token helper, then paste only the resulting token into the local `.env` file: + +```env +YANDEX_MUSIC_TOKEN=your_oauth_token_here +``` + +Known community helper: + +- `https://github.com/MarshalX/yandex-music-token` + +Treat the token as a password. Do not commit `.env`, paste the token into chat, or add it to any tracked config. After saving `.env`, validate access without writing raw data: + +```bash +make preflight +``` + +The Python CLI and support scripts load `.env` directly. The Makefile invokes commands through `scripts/run_with_dotenv.py`, so `YANDEX_MUSIC_TOKEN`, `STREAMIFY_RAW_DIR`, `STREAMIFY_DUCKDB_PATH`, `STREAMIFY_REPORT_PATH`, `STREAMIFY_SNAPSHOT_PATH`, `STREAMIFY_RECOMMENDATIONS_DIR`, `STREAMIFY_DASHBOARD_PORT`, and `DBT_THREADS` behave the same in direct commands and `make` targets without Make parsing token values. + +Run `make help` when you want the shortest command map for sample metadata, real-account metadata, Docker Compose, exports, readiness and cleanup. + +The Yandex Music adapter uses bounded retries around client initialization, account-level API calls, playlist hydration and liked-track hydration. 
A transient failure should not immediately break `make preflight` or `make ingest`; repeated failures still surface as sanitized `YandexMusicIngestError` messages without printing the token. + +Run `make status` before a real account run when you want a safe local diagnostic. It reports whether `.env` exists, whether a token is configured, where raw/DuckDB/report/snapshot/recommendations artifacts are expected, the latest manifest source/timestamp when available, and the next command to run. It does not call Yandex Music and does not print token values. + +The Docker path uses the `local` profile. `make up-local` loads `.env` through `scripts/run_with_dotenv.py` before invoking Docker Compose, so token values are passed through the process environment instead of Make parsing. The one-shot `streamify-local` service falls back to sample data when the token is empty and runs with `set -euo pipefail`, so a failed ingestion/dbt/quality step stops the stack instead of continuing on stale files. When a token is present, the readiness audit uses `--require-real`; otherwise it validates the deterministic sample path. The service runs ingestion, raw contract validation, dbt build, doctor, static report export, and readiness audit before the dashboard starts: + +```bash +make up-local +``` + +For an automated Docker smoke test that does not call a real account, run: + +```bash +make compose-smoke-local +``` + +The local dbt command is: + +```bash +cd dbt && dbt build --profiles-dir . --target local --select yamusic +``` + +For normal use, prefer `make dbt-build`: it runs `dbt deps` first, so a fresh checkout does not depend on an existing ignored `dbt/dbt_packages` directory. + +## Data Flow + +1. `yamusic_ingest` writes normalized JSONL files plus `_manifest.json` into `data/raw/yamusic`. +2. dbt DuckDB reads those JSONL files with typed `read_json` schemas so empty/private account files still compile predictably. +3. 
Staging models deduplicate tracks, artists, albums, playlists, playlist membership and library events. +4. Mart models produce track, artist, album, playlist, library-event, affinity, period, genre, genre-period, overlap and track-signal tables. +5. `dashboard/app.py` reads `data/streamify.duckdb` and presents the self-analytics workspace. +6. `scripts/export_yamusic_summary.py` writes `data/streamify_summary.md` for a portable answer-first summary. +7. `scripts/export_yamusic_snapshot.py` writes `data/streamify_snapshot.json` for automation, CI artifacts and downstream agent workflows. +8. `scripts/export_yamusic_recommendations.py` writes practical CSV queues into `data/recommendations`. + +See [Yandex Music Local Lineage](yamusic_lineage.md) for the raw-to-dashboard model catalog and product-question mapping. + +## Real Account Acceptance Check + +After setting `YANDEX_MUSIC_TOKEN`, a successful real-account run should satisfy this checklist: + +- `make acceptance-real` completes end-to-end; +- `make ingest` exits successfully without printing the token; +- `make preflight` returns Yandex Music access counts without writing raw data or printing the token; +- `data/raw/yamusic/tracks.jsonl` exists and contains account metadata rows, or the CLI clearly reports that the account/API returned no rows; +- `make dbt-build` completes with the `local` DuckDB target; +- `make dashboard` opens and shows either non-empty metrics or a clear no-data state; +- no audio files are created under `data/`. + +Empty/private accounts are a supported state. The dbt smoke test builds against empty raw JSONL files, and the dashboard shows `No Yandex Music library metadata was returned for this run.` when `yamusic_library_profile.total_tracks` is zero. + +## Raw Datasets + +- `tracks.jsonl`: track metadata, album fields, artist arrays and liked flag. +- `artists.jsonl`: normalized artist metadata discovered through tracks and account-visible liked artists. 
+- `albums.jsonl`: normalized album metadata discovered through tracks and account-visible liked albums. +- `playlists.jsonl`: owned playlist metadata plus account-visible liked playlist metadata and declared track counts when available. +- `playlist_tracks.jsonl`: playlist-track membership. +- `user_library_events.jsonl`: derived events for liked tracks and playlist membership. +- `_manifest.json`: source, generated timestamp, adapter/client metadata, row counts, JSONL checksums and output paths; it must not contain token material. +- `_manifest.json.diagnostics`: aggregate skip/fallback counters for liked-track hydration, liked album/artist/playlist metadata, playlist hydration, playlist-track hydration, missing IDs, duplicate liked shortcuts and duplicate playlist-track memberships; these are counts only and do not store skipped object identifiers. A `*_fetch_failed` counter means full metadata enrichment failed after retries, not necessarily that the library row was dropped. + +## Product Answers + +The local marts are designed around practical self-analytics questions: + +- favorite artists and tracks: `yamusic_artist_affinity`, `yamusic_dim_tracks`; +- genre shifts and diversity: `yamusic_genre_profile`, `yamusic_genre_periods`, `yamusic_period_activity`; +- repeated patterns: `yamusic_track_signals.repeat_signal`; +- active periods: `yamusic_period_activity`; +- underrated tracks: liked tracks with low playlist coverage in `yamusic_track_signals`; +- underrated playlists: high-uniqueness, low-overlap playlists in `yamusic_playlist_signals`; +- playlist overlap: pairwise Jaccard similarity in `yamusic_playlist_overlap`. + +The dashboard includes sidebar filters for genre, liked state and track/artist/album search. These filters apply to track-level discovery views such as the library snapshot and repeated/underrated tracks. 
The Actions tab turns the marts into next-step queues: real-account/data-quality actions, rediscovery tracks, playlist cleanup candidates, standout playlists, and download buttons for the markdown summary, JSON snapshot and recommendations CSV files. + +`make report` exports the same marts into the following portable artifacts: + +- `data/streamify_summary.md`: executive summary, top artists, genre shifts, repeat signals, underrated tracks, underrated playlists, next steps and caveats. +- `data/streamify_snapshot.json`: schema-versioned JSON with profile metrics, raw counts, ingestion diagnostics, favorite artists/tracks, genre shifts, active periods, repeat tracks, playlist overlap, underrated candidates and next actions. +- `data/recommendations/*.csv`: spreadsheet-friendly exports for top artists, rediscovery tracks, playlist cleanup, standout playlists and genre shifts. + +## Data Quality + +The local dbt layer checks: + +- non-null and unique track, artist, playlist and event keys; +- playlist-track relationships back to playlist and track dimensions; +- accepted values for derived event types; +- duplicate control through staging dedupe; +- stale Parquet cleanup on empty or `--json-only` ingestion reruns, so raw metadata outputs reflect the latest run rather than leftovers from an earlier run; +- genre-period uniqueness by month and genre in `yamusic_genre_periods`; +- track-signal checks for repeat and underrated flags; +- stale ingestion visibility through `yamusic_library_profile.stale_ingestion_flag`, which is raised when the newest local library event is older than 168 hours or missing; +- bounded retries on external Yandex Music client calls, with unit coverage for transient preflight and track-fetch failures; +- raw JSONL checksums and ingestion diagnostics in the manifest, DuckDB profile, readiness JSON, dashboard Data Quality tab, JSON snapshot and static report. 
+ +The legacy BigQuery/Airflow quality contract remains intact and is still checked by `scripts/validate_dbt_quality.py`. + +`scripts/check_no_local_sensitive_artifacts.py` fails when root `.env`, local Yandex raw data, DuckDB warehouse files, or audio files under `data/` are tracked by git. `.env.example` remains safe to commit because it contains an empty token placeholder. + +`make raw-contract` validates the bronze/raw JSONL shape before dbt reads it: required fields, basic JSON types, allowed `source` values, accepted library event types, manifest row counts, JSONL sha256 checksums, ingestion diagnostics consistency, unique IDs, and playlist/event referential integrity. Empty/private accounts remain valid when the files exist and contain zero rows. + +`make doctor` runs local acceptance checks against the latest raw metadata and DuckDB marts: manifest row counts, JSONL validity, required mart tables, one-row library profile, no missing self-analytics tables, source/raw-count alignment between `_manifest.json` and `yamusic_library_profile`, and non-empty signal tables when account data exists. + +`make readiness` emits a JSON readiness summary for the current local product artifact: source type, raw row counts, DuckDB profile counts, report path, snapshot path, recommendations directory, no-audio status, stale-dbt protection, and whether the latest run proves real-account ingestion. `make readiness-real` uses the same audit with `--require-real` and fails unless `_manifest.json` declares `source=yandex_music`. + +`make acceptance-real` is the final real-account gate. It runs `make preflight`, `make ingest`, raw contract validation, dbt deps/build, `make doctor`, `make report`, `make readiness-real`, and `make dashboard-smoke` against the configured `YANDEX_MUSIC_TOKEN`. 
+ +`make dashboard-smoke` runs a Streamlit content smoke with `streamlit.testing.v1.AppTest` to verify the expected self-analytics title, metrics, tabs, sections, dataframes and Data Quality JSON block. It then starts Streamlit against the local DuckDB file and verifies that the dashboard returns HTTP 200. Browser QA is still useful for visual regressions, but this gives a fast command-line guard for both content and app startup. + +`make compose-smoke-local` runs the Docker Compose `local` profile with a deterministic empty token, verifies that the dashboard returns HTTP 200 after the one-shot ingestion/dbt/doctor/report/readiness service succeeds, then validates the mounted raw contract, readiness JSON, product-answer exports and dashboard content smoke before tearing the stack down. + +## Limitations + +Yandex Music does not provide a stable public API for every analytics use case. This project isolates integration behind `yamusic_ingest/yandex_client.py` and uses the unofficial `yandex-music` Python package. Available fields can differ by account, region, subscription state, and library visibility. + +If the real integration returns less data than expected, use `make ingest-sample` to verify the local pipeline and dashboard independently from account access. + +## Reset + +```bash +make clean-local +make ingest-sample +make dbt-build +``` + +`make clean-local` removes generated raw metadata, DuckDB databases, static summary/snapshot reports, recommendations CSV files, dbt `target`/`logs`/`dbt_packages`, and smoke-test artifacts such as `streamify_empty_smoke`. It does not remove `.env`, token configuration, source files, or documentation. 
diff --git a/local/Dockerfile b/local/Dockerfile new file mode 100644 index 0000000..02200ad --- /dev/null +++ b/local/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential git make \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +ENV STREAMIFY_DATA_DIR=data \ + STREAMIFY_RAW_DIR=data/raw/yamusic \ + STREAMIFY_DUCKDB_PATH=data/streamify.duckdb diff --git a/requirements.txt b/requirements.txt index fec8cfb..c6e2224 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,9 @@ kafka-python==1.4.6 -dbt-bigquery==1.0.0 \ No newline at end of file +dbt-bigquery==1.9.1 +yandex-music==2.2.0 +duckdb==1.1.3 +dbt-duckdb==1.9.1 +pandas==2.2.3 +pyarrow==18.1.0 +streamlit==1.41.1 +pytest==8.3.4 diff --git a/scripts/audit_yamusic_readiness.py b/scripts/audit_yamusic_readiness.py new file mode 100644 index 0000000..89e368a --- /dev/null +++ b/scripts/audit_yamusic_readiness.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import csv +import json +import os +import sys +from pathlib import Path +from typing import Any + +import duckdb + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import load_dotenv + +load_dotenv(ROOT / ".env") +RAW_DIR = ROOT / os.getenv("STREAMIFY_RAW_DIR", "data/raw/yamusic") +DUCKDB_PATH = ROOT / os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb") +REPORT_PATH = ROOT / os.getenv("STREAMIFY_REPORT_PATH", "data/streamify_summary.md") +SNAPSHOT_PATH = ROOT / os.getenv("STREAMIFY_SNAPSHOT_PATH", "data/streamify_snapshot.json") +RECOMMENDATIONS_DIR = ROOT / os.getenv("STREAMIFY_RECOMMENDATIONS_DIR", "data/recommendations") + +REQUIRED_RAW_DATASETS = [ + "tracks", + "artists", + "albums", + "playlists", + "playlist_tracks", + "user_library_events", +] 
+REQUIRED_MARTS = [ + "stg_yamusic_manifest", + "yamusic_dim_tracks", + "yamusic_dim_artists", + "yamusic_dim_albums", + "yamusic_dim_playlists", + "yamusic_fact_library_events", + "yamusic_fact_playlist_tracks", + "yamusic_artist_affinity", + "yamusic_library_profile", + "yamusic_track_signals", + "yamusic_period_activity", + "yamusic_genre_profile", + "yamusic_genre_periods", + "yamusic_playlist_overlap", + "yamusic_playlist_signals", +] +AUDIO_EXTENSIONS = { + ".aac", + ".aiff", + ".alac", + ".flac", + ".m4a", + ".mp3", + ".ogg", + ".opus", + ".wav", + ".wma", +} + + +def fail(message: str) -> None: + raise AssertionError(message) + + +def read_manifest() -> dict[str, Any]: + path = RAW_DIR / "_manifest.json" + if not path.exists(): + fail(f"Missing ingestion manifest: {path}") + manifest = json.loads(path.read_text(encoding="utf-8")) + if manifest.get("source") not in {"sample", "yandex_music"}: + fail("Manifest source must be sample or yandex_music") + if "token" in json.dumps(manifest).lower(): + fail("Manifest must not contain token material") + return manifest + + +def count_jsonl(path: Path) -> int: + rows = 0 + with path.open(encoding="utf-8") as file: + for line in file: + if line.strip(): + json.loads(line) + rows += 1 + return rows + + +def audit_raw(manifest: dict[str, Any]) -> dict[str, int]: + datasets = manifest.get("datasets") + if not isinstance(datasets, dict): + fail("Manifest datasets must be an object") + counts: dict[str, int] = {} + for dataset in REQUIRED_RAW_DATASETS: + path = RAW_DIR / f"{dataset}.jsonl" + if not path.exists(): + fail(f"Missing raw JSONL dataset: {path}") + row_count = count_jsonl(path) + manifest_count = datasets.get(dataset, {}).get("row_count") + if manifest_count != row_count: + fail(f"Manifest row count mismatch for {dataset}: manifest={manifest_count}, actual={row_count}") + counts[dataset] = row_count + return counts + + +def audit_duckdb(manifest: dict[str, Any], raw_counts: dict[str, int]) -> dict[str, Any]: + 
if not DUCKDB_PATH.exists(): + fail(f"Missing local DuckDB database: {DUCKDB_PATH}") + with duckdb.connect(str(DUCKDB_PATH), read_only=True) as conn: + tables = { + row[0] + for row in conn.execute( + "select table_name from information_schema.tables where table_schema = 'main'" + ).fetchall() + } + missing = sorted(set(REQUIRED_MARTS) - tables) + if missing: + fail(f"Missing local mart tables: {', '.join(missing)}") + profile = conn.execute("select * from yamusic_library_profile").fetchdf() + if len(profile.index) != 1: + fail(f"yamusic_library_profile must contain exactly one row, found {len(profile.index)}") + row = profile.iloc[0] + total_tracks = int(row["total_tracks"] or 0) + stale_ingestion_flag = int(row["stale_ingestion_flag"] or 0) + if stale_ingestion_flag not in {0, 1}: + fail("stale_ingestion_flag must be 0 or 1") + manifest_source = str(row["manifest_source"]) + if manifest_source != manifest["source"]: + fail(f"DuckDB profile source {manifest_source!r} does not match manifest source {manifest['source']!r}; rerun make dbt-build") + adapter = { + "adapter_name": str(row["adapter_name"]), + "adapter_version": str(row["adapter_version"]), + "client_library": str(row["client_library"]), + "client_library_version": None if row["client_library_version"] is None else str(row["client_library_version"]), + } + for field in ["adapter_name", "adapter_version", "client_library"]: + if not adapter[field]: + fail(f"DuckDB profile adapter metadata field {field} must not be empty") + profile_raw_counts = { + "tracks": int(row["raw_tracks"] or 0), + "artists": int(row["raw_artists"] or 0), + "albums": int(row["raw_albums"] or 0), + "playlists": int(row["raw_playlists"] or 0), + "playlist_tracks": int(row["raw_playlist_tracks"] or 0), + "user_library_events": int(row["raw_user_library_events"] or 0), + } + if profile_raw_counts != raw_counts: + fail(f"DuckDB profile raw counts {profile_raw_counts} do not match manifest raw counts {raw_counts}; rerun make dbt-build") + 
profile_raw_checksums = { + "tracks": str(row["raw_tracks_sha256"]), + "artists": str(row["raw_artists_sha256"]), + "albums": str(row["raw_albums_sha256"]), + "playlists": str(row["raw_playlists_sha256"]), + "playlist_tracks": str(row["raw_playlist_tracks_sha256"]), + "user_library_events": str(row["raw_user_library_events_sha256"]), + } + manifest_checksums = { + dataset: str(manifest["datasets"][dataset]["jsonl_sha256"]) + for dataset in REQUIRED_RAW_DATASETS + } + if profile_raw_checksums != manifest_checksums: + fail("DuckDB profile raw checksums do not match manifest checksums; rerun make dbt-build") + diagnostics = { + "liked_shortcuts_seen": int(row["diagnostic_liked_shortcuts_seen"] or 0), + "liked_tracks_written": int(row["diagnostic_liked_tracks_written"] or 0), + "liked_shortcuts_fetch_failed": int(row["diagnostic_liked_shortcuts_fetch_failed"] or 0), + "liked_shortcuts_missing_track_id": int(row["diagnostic_liked_shortcuts_missing_track_id"] or 0), + "liked_tracks_duplicate_skipped": int(row["diagnostic_liked_tracks_duplicate_skipped"] or 0), + "liked_albums_seen": int(row["diagnostic_liked_albums_seen"] or 0), + "liked_albums_written": int(row["diagnostic_liked_albums_written"] or 0), + "liked_albums_missing_id": int(row["diagnostic_liked_albums_missing_id"] or 0), + "liked_albums_duplicate_skipped": int(row["diagnostic_liked_albums_duplicate_skipped"] or 0), + "liked_artists_seen": int(row["diagnostic_liked_artists_seen"] or 0), + "liked_artists_written": int(row["diagnostic_liked_artists_written"] or 0), + "liked_artists_missing_id": int(row["diagnostic_liked_artists_missing_id"] or 0), + "liked_artists_duplicate_skipped": int(row["diagnostic_liked_artists_duplicate_skipped"] or 0), + "liked_playlists_seen": int(row["diagnostic_liked_playlists_seen"] or 0), + "liked_playlists_written": int(row["diagnostic_liked_playlists_written"] or 0), + "liked_playlists_missing_id": int(row["diagnostic_liked_playlists_missing_id"] or 0), + 
"liked_playlists_duplicate_skipped": int(row["diagnostic_liked_playlists_duplicate_skipped"] or 0), + "playlists_seen": int(row["diagnostic_playlists_seen"] or 0), + "playlists_written": int(row["diagnostic_playlists_written"] or 0), + "playlists_missing_id": int(row["diagnostic_playlists_missing_id"] or 0), + "playlist_fetch_fallbacks": int(row["diagnostic_playlist_fetch_fallbacks"] or 0), + "playlist_tracks_seen": int(row["diagnostic_playlist_tracks_seen"] or 0), + "playlist_tracks_written": int(row["diagnostic_playlist_tracks_written"] or 0), + "playlist_tracks_fetch_failed": int(row["diagnostic_playlist_tracks_fetch_failed"] or 0), + "playlist_tracks_missing_track_id": int(row["diagnostic_playlist_tracks_missing_track_id"] or 0), + "playlist_tracks_duplicate_skipped": int(row["diagnostic_playlist_tracks_duplicate_skipped"] or 0), + } + return { + "manifest_source": manifest_source, + "manifest_generated_at": str(row["manifest_generated_at"]), + "adapter": adapter, + "ingestion_diagnostics": diagnostics, + "raw_counts_from_profile": profile_raw_counts, + "raw_checksums_from_profile": profile_raw_checksums, + "total_tracks": total_tracks, + "liked_tracks": int(row["liked_tracks"] or 0), + "artists": int(row["artists"] or 0), + "playlists": int(row["playlists"] or 0), + "known_genres": int(row["known_genres"] or 0), + "stale_ingestion_flag": stale_ingestion_flag, + } + + +def audit_report() -> None: + if not REPORT_PATH.exists(): + fail(f"Missing markdown self-analytics report: {REPORT_PATH}") + text = REPORT_PATH.read_text(encoding="utf-8") + for marker in [ + "Streamify Yandex Music Self-Analytics Summary", + "Executive Summary", + "Recommended Next Steps", + "Caveats And Assumptions", + ]: + if marker not in text: + fail(f"Report must contain {marker!r}") + token = os.getenv("YANDEX_MUSIC_TOKEN") + if token and token in text: + fail("Report must not contain the configured Yandex Music token value") + + +def audit_snapshot(manifest: dict[str, Any]) -> None: + if 
not SNAPSHOT_PATH.exists(): + fail(f"Missing JSON self-analytics snapshot: {SNAPSHOT_PATH}") + snapshot_text = SNAPSHOT_PATH.read_text(encoding="utf-8") + snapshot = json.loads(snapshot_text) + if snapshot.get("schema_version") != "1.0": + fail("Snapshot schema_version must be 1.0") + if snapshot.get("source") != manifest["source"]: + fail(f"Snapshot source {snapshot.get('source')!r} does not match manifest source {manifest['source']!r}") + if not isinstance(snapshot.get("answers"), dict): + fail("Snapshot must contain an answers object") + for key in ["favorite_artists", "favorite_tracks", "genre_shifts", "repeat_tracks", "playlist_overlap"]: + if key not in snapshot["answers"]: + fail(f"Snapshot answers must contain {key!r}") + quality = snapshot.get("quality") + if not isinstance(quality, dict): + fail("Snapshot must contain a quality object") + for key in ["raw_counts", "raw_checksums", "ingestion_diagnostics", "adapter"]: + if key not in quality: + fail(f"Snapshot quality must contain {key!r}") + for dataset, value in quality["raw_checksums"].items(): + if not isinstance(value, str) or len(value) != 64: + fail(f"Snapshot raw checksum for {dataset} must be a 64-character sha256 digest") + token = os.getenv("YANDEX_MUSIC_TOKEN") + if token and token in snapshot_text: + fail("Snapshot must not contain the configured Yandex Music token value") + + +def audit_recommendations() -> None: + expected_files = [ + "top_artists.csv", + "rediscovery_tracks.csv", + "playlist_cleanup.csv", + "standout_playlists.csv", + "genre_shifts.csv", + ] + if not RECOMMENDATIONS_DIR.exists(): + fail(f"Missing recommendations export directory: {RECOMMENDATIONS_DIR}") + token = os.getenv("YANDEX_MUSIC_TOKEN") + for file_name in expected_files: + path = RECOMMENDATIONS_DIR / file_name + if not path.exists(): + fail(f"Missing recommendations export: {path}") + text = path.read_text(encoding="utf-8") + if token and token in text: + fail(f"Recommendation export {file_name} must not contain 
the configured Yandex Music token value") + with path.open(encoding="utf-8", newline="") as file: + rows = list(csv.reader(file)) + if not rows or not rows[0]: + fail(f"Recommendation export {file_name} must contain a header row") + + +def audit_no_audio() -> None: + data_dir = ROOT / "data" + if not data_dir.exists(): + return + audio_files = [path for path in data_dir.rglob("*") if path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS] + if audio_files: + fail(f"Audio files must not be stored under data/: {audio_files[0]}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Audit local Yandex Music product readiness.") + parser.add_argument( + "--require-real", + action="store_true", + help="Fail unless the latest raw manifest was produced from real Yandex Music metadata.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + manifest = read_manifest() + raw_counts = audit_raw(manifest) + profile = audit_duckdb(manifest, raw_counts) + audit_report() + audit_snapshot(manifest) + audit_recommendations() + audit_no_audio() + real_account_verified = manifest["source"] == "yandex_music" + if args.require_real and not real_account_verified: + fail("Real-account readiness requires _manifest.json source=yandex_music. Run make acceptance-real with a valid YANDEX_MUSIC_TOKEN.") + + summary = { + "source": manifest["source"], + "raw_dir": str(RAW_DIR), + "duckdb_path": str(DUCKDB_PATH), + "report_path": str(REPORT_PATH), + "snapshot_path": str(SNAPSHOT_PATH), + "recommendations_dir": str(RECOMMENDATIONS_DIR), + "raw_counts": raw_counts, + "profile": profile, + "real_account_verified": real_account_verified, + } + print(json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True)) + if manifest["source"] == "sample": + print("OK: local product readiness is valid for sample metadata. 
Real-account acceptance still requires make acceptance-real.") + else: + print("OK: local product readiness is valid for Yandex Music metadata.") + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except (AssertionError, json.JSONDecodeError, duckdb.Error) as error: + print(f"ERROR: {error}", file=sys.stderr) + raise SystemExit(1) diff --git a/scripts/build_pages_site.py b/scripts/build_pages_site.py new file mode 100644 index 0000000..3e86465 --- /dev/null +++ b/scripts/build_pages_site.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +from html import escape +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +PUBLIC_DIR = ROOT / "public" + +PAGES = [ + ("README", ROOT / "README.md"), + ("Local Runbook", ROOT / "docs" / "yandex_music_local.md"), + ("Lineage", ROOT / "docs" / "yamusic_lineage.md"), + ("Acceptance", ROOT / "docs" / "product_acceptance.md"), + ("Project Management", ROOT / "docs" / "project_management.md"), + ("Release Process", ROOT / "docs" / "release_process.md"), + ("Sample Summary", ROOT / "data" / "streamify_summary.md"), +] + + +def markdown_to_html(markdown: str) -> str: + body: list[str] = [] + in_list = False + in_code = False + code_lines: list[str] = [] + for raw_line in markdown.splitlines(): + line = raw_line.rstrip() + if line.startswith("```"): + if in_code: + body.append("
" + escape("\n".join(code_lines)) + "
") + code_lines = [] + in_code = False + else: + if in_list: + body.append("") + in_list = False + in_code = True + continue + if in_code: + code_lines.append(line) + continue + if not line: + if in_list: + body.append("") + in_list = False + continue + if line.startswith("# "): + if in_list: + body.append("") + in_list = False + body.append(f"

{escape(line[2:])}

") + elif line.startswith("## "): + if in_list: + body.append("") + in_list = False + body.append(f"

{escape(line[3:])}

") + elif line.startswith("### "): + if in_list: + body.append("") + in_list = False + body.append(f"

{escape(line[4:])}

") + elif line.startswith("- "): + if not in_list: + body.append("") + in_list = False + body.append(f"

{escape(line)}

") + if in_code: + body.append("
" + escape("\n".join(code_lines)) + "
") + if in_list: + body.append("") + return "\n".join(body) + + +def page_html(title: str, body: str) -> str: + return f""" + + + + + {escape(title)} | Streamify + + + +
+

Streamify

+ +
+
{body}
+ + +""" + + +def main() -> int: + PUBLIC_DIR.mkdir(parents=True, exist_ok=True) + filenames = ["index.html", "runbook.html", "lineage.html", "acceptance.html", "management.html", "release.html", "sample-summary.html"] + for (title, path), filename in zip(PAGES, filenames): + markdown = path.read_text(encoding="utf-8") if path.exists() else f"# {title}\n\nRun `make report` to generate this page." + (PUBLIC_DIR / filename).write_text(page_html(title, markdown_to_html(markdown)), encoding="utf-8") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/check_no_audio_artifacts.py b/scripts/check_no_audio_artifacts.py new file mode 100644 index 0000000..1a31308 --- /dev/null +++ b/scripts/check_no_audio_artifacts.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +DATA_DIR = ROOT / "data" +AUDIO_EXTENSIONS = { + ".aac", + ".aiff", + ".alac", + ".flac", + ".m4a", + ".mp3", + ".ogg", + ".opus", + ".wav", + ".wma", +} + + +def main() -> int: + if not DATA_DIR.exists(): + print("OK: data directory is absent; no audio artifacts found.") + return 0 + + audio_files = [ + path.relative_to(ROOT) + for path in DATA_DIR.rglob("*") + if path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS + ] + if audio_files: + print("ERROR: Streamify local mode must not store audio files.", file=sys.stderr) + for path in audio_files[:25]: + print(f"- {path}", file=sys.stderr) + if len(audio_files) > 25: + print(f"... 
ROOT = Path(__file__).resolve().parents[1]
AUDIO_EXTENSIONS = {
    ".aac",
    ".aiff",
    ".alac",
    ".flac",
    ".m4a",
    ".mp3",
    ".ogg",
    ".opus",
    ".wav",
    ".wma",
}
FORBIDDEN_TRACKED_PATHS = {
    ".env",
    "data/raw/yamusic",
    "data/streamify.duckdb",
    "data/streamify.duckdb.wal",
}
REQUIRED_GITIGNORE_MARKERS = [
    ".env",
    "data/",
    "*.duckdb",
    "*.duckdb.wal",
]


def _split_ls_files_output(stdout: str) -> list[str]:
    """Split NUL-terminated ``git ls-files -z`` output into individual paths."""
    return [entry for entry in stdout.split("\0") if entry]


def _missing_gitignore_markers(gitignore_text: str, markers: list[str]) -> list[str]:
    """Return the markers that no active ignore pattern in ``gitignore_text`` contains.

    Comment lines (``#``) and negated patterns (``!``) are skipped because they
    do not ignore anything, so a marker mentioned only there must not count.
    """
    active_patterns = [
        stripped
        for stripped in (line.strip() for line in gitignore_text.splitlines())
        if stripped and not stripped.startswith(("#", "!"))
    ]
    return [
        marker
        for marker in markers
        if not any(marker in pattern for pattern in active_patterns)
    ]


def git_ls_files() -> list[str]:
    """List every path tracked by git in the repository rooted at ``ROOT``.

    Raises:
        subprocess.CalledProcessError: if ``git ls-files`` exits non-zero.
    """
    # -z disables C-style quoting of "unusual" (e.g. non-ASCII) path names, so
    # the prefix and suffix checks in main() see the real path text.
    result = subprocess.run(
        ["git", "ls-files", "-z"],
        cwd=ROOT,
        text=True,
        capture_output=True,
        check=True,
    )
    return _split_ls_files_output(result.stdout)


def main() -> int:
    """Check that no local secrets, raw data, DuckDB files or audio are tracked.

    Also verifies that ``.gitignore`` declares every pattern listed in
    ``REQUIRED_GITIGNORE_MARKERS``.

    Returns:
        ``0`` when the guard passes, ``1`` after printing each violation to
        stderr.
    """
    tracked_files = git_ls_files()
    errors: list[str] = []

    for path in tracked_files:
        normalized = path.strip("/")
        if normalized in FORBIDDEN_TRACKED_PATHS or any(
            normalized.startswith(f"{forbidden}/") for forbidden in FORBIDDEN_TRACKED_PATHS
        ):
            errors.append(f"local sensitive artifact is tracked: {path}")
        if normalized.startswith("data/") and Path(normalized).suffix.lower() in AUDIO_EXTENSIONS:
            errors.append(f"audio file is tracked under data/: {path}")

    try:
        gitignore = (ROOT / ".gitignore").read_text(encoding="utf-8")
    except FileNotFoundError:
        # A missing .gitignore protects nothing: report every marker as absent
        # instead of crashing with a traceback.
        gitignore = ""
    for marker in _missing_gitignore_markers(gitignore, REQUIRED_GITIGNORE_MARKERS):
        errors.append(f".gitignore must contain {marker!r}")

    if errors:
        print("ERROR: local product sensitive-artifact guard failed.", file=sys.stderr)
        for error in errors:
            print(f"- {error}", file=sys.stderr)
        return 1

    print("OK: no local Yandex Music secrets, raw data, DuckDB files, or audio artifacts are tracked.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
def check_raw_files(manifest: dict[str, Any]) -> dict[str, int]:
    """Validate every required raw JSONL file against the ingestion manifest.

    Each file named in ``REQUIRED_DATASETS`` must exist under ``RAW_DIR``,
    every non-blank line must parse as JSON, and the number of such lines must
    equal the ``row_count`` the manifest declares for that dataset.

    Args:
        manifest: Parsed ``_manifest.json`` content.

    Returns:
        Mapping of dataset name to the number of JSON rows found on disk.

    Raises:
        AssertionError: via ``fail`` on any missing file, invalid line or
            row-count mismatch.
    """
    datasets = manifest.get("datasets")
    if not isinstance(datasets, dict):
        fail("_manifest.json must contain datasets object")
    counts: dict[str, int] = {}
    for dataset in REQUIRED_DATASETS:
        jsonl_path = RAW_DIR / f"{dataset}.jsonl"
        if not jsonl_path.exists():
            fail(f"Missing raw dataset: {jsonl_path}")
        rows = 0
        with jsonl_path.open(encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                if not line.strip():
                    continue
                try:
                    json.loads(line)
                except json.JSONDecodeError as error:
                    # Name the file and line so the broken record can be found.
                    fail(f"Invalid JSON in {jsonl_path} at line {line_number}: {error}")
                rows += 1
        entry = datasets.get(dataset)
        # A missing or non-object entry has no usable row_count; treat it as a
        # mismatch instead of crashing on ``.get``.
        manifest_count = entry.get("row_count") if isinstance(entry, dict) else None
        if manifest_count != rows:
            fail(f"Manifest row count mismatch for {dataset}: manifest={manifest_count}, actual={rows}")
        counts[dataset] = rows
    return counts


def scalar(conn: duckdb.DuckDBPyConnection, sql: str) -> Any:
    """Return the first column of the first row produced by ``sql``.

    The query must yield at least one row (every caller here uses
    ``count(*)``).
    """
    return conn.execute(sql).fetchone()[0]


def check_duckdb(manifest: dict[str, Any], raw_counts: dict[str, int]) -> None:
    """Check that the local DuckDB marts exist and agree with the raw ingestion.

    Verifies that all ``REQUIRED_TABLES`` exist in schema ``main``, that
    ``yamusic_library_profile`` has exactly one row whose source, adapter
    metadata and raw counts match the manifest, and that dependent marts are
    populated whenever the profile reports tracks, active months or playlists.

    Args:
        manifest: Parsed ``_manifest.json`` content.
        raw_counts: Row counts per dataset as returned by ``check_raw_files``.

    Raises:
        AssertionError: via ``fail`` on the first violated expectation.
    """
    if not DUCKDB_PATH.exists():
        fail(f"Missing local DuckDB database: {DUCKDB_PATH}")
    with duckdb.connect(str(DUCKDB_PATH), read_only=True) as conn:
        tables = {
            row[0]
            for row in conn.execute(
                "select table_name from information_schema.tables where table_schema = 'main'"
            ).fetchall()
        }
        missing = sorted(set(REQUIRED_TABLES) - tables)
        if missing:
            fail(f"Missing local mart tables: {', '.join(missing)}")

        profile_rows = scalar(conn, "select count(*) from yamusic_library_profile")
        if profile_rows != 1:
            fail(f"yamusic_library_profile must contain exactly one row, found {profile_rows}")

        profile = conn.execute(
            """
            select
                total_tracks,
                manifest_source,
                adapter_name,
                adapter_version,
                client_library,
                raw_tracks,
                raw_artists,
                raw_albums,
                raw_playlists,
                raw_playlist_tracks,
                raw_user_library_events,
                playlists,
                stale_ingestion_flag,
                underrated_tracks,
                underrated_playlists,
                active_months
            from yamusic_library_profile
            """
        ).fetchone()
        (
            total_tracks,
            manifest_source,
            adapter_name,
            adapter_version,
            client_library,
            raw_tracks,
            raw_artists,
            raw_albums,
            raw_playlists,
            raw_playlist_tracks,
            raw_user_library_events,
            playlists,
            stale_ingestion_flag,
            _,
            _,
            active_months,
        ) = profile
        if manifest_source != manifest["source"]:
            fail(f"DuckDB profile source {manifest_source!r} does not match manifest source {manifest['source']!r}; rerun make dbt-build")
        for field_name, field_value in {
            "adapter_name": adapter_name,
            "adapter_version": adapter_version,
            "client_library": client_library,
        }.items():
            if not field_value:
                fail(f"DuckDB profile adapter metadata field {field_name} must not be empty")
        profile_raw_counts = {
            "tracks": int(raw_tracks or 0),
            "artists": int(raw_artists or 0),
            "albums": int(raw_albums or 0),
            "playlists": int(raw_playlists or 0),
            "playlist_tracks": int(raw_playlist_tracks or 0),
            "user_library_events": int(raw_user_library_events or 0),
        }
        if profile_raw_counts != raw_counts:
            fail(f"DuckDB profile raw counts {profile_raw_counts} do not match manifest raw counts {raw_counts}; rerun make dbt-build")
        if stale_ingestion_flag not in {0, 1}:
            fail("stale_ingestion_flag must be 0 or 1")

        # NULL counters are treated as zero, matching the raw_* handling above,
        # so a sparse profile row cannot raise TypeError on the comparisons.
        track_total = int(total_tracks or 0)
        month_total = int(active_months or 0)
        playlist_total = int(playlists or 0)
        if track_total > 0:
            if scalar(conn, "select count(*) from yamusic_track_signals") == 0:
                fail("yamusic_track_signals must be non-empty when tracks exist")
        if month_total > 0 and scalar(conn, "select count(*) from yamusic_period_activity") == 0:
            fail("yamusic_period_activity must be non-empty when activity months exist")
        if month_total > 0 and scalar(conn, "select count(*) from yamusic_genre_periods") == 0:
            fail("yamusic_genre_periods must be non-empty when activity months exist")
        if playlist_total > 0 and scalar(conn, "select count(*) from yamusic_playlist_signals") == 0:
            fail("yamusic_playlist_signals must be non-empty when playlists exist")


def main() -> int:
    """Run the manifest, raw-file and DuckDB checks in order; return ``0`` on success."""
    manifest = read_manifest()
    raw_counts = check_raw_files(manifest)
    check_duckdb(manifest, raw_counts)
    print("OK: local Yandex Music acceptance checks passed.")
    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    # duckdb.Error is included so database problems print a one-line error
    # like every other check failure instead of a traceback.
    except (AssertionError, json.JSONDecodeError, duckdb.Error) as error:
        print(f"ERROR: {error}", file=sys.stderr)
        raise SystemExit(1)
def env_path(name: str, default: Path) -> Path:
    """Resolve a filesystem path from environment variable ``name``.

    Returns ``default`` when the variable is unset or empty. A relative value
    is resolved against ``ROOT``; an absolute value is returned unchanged.
    """
    configured = os.getenv(name)
    if configured:
        candidate = Path(configured)
        if not candidate.is_absolute():
            candidate = ROOT / candidate
        return candidate
    return default


def write_csv(path: Path, columns: list[str], rows: list[tuple[Any, ...]]) -> int:
    """Write a header plus ``rows`` to ``path`` as UTF-8 CSV.

    Parent directories are created when missing.

    Returns:
        The number of data rows written (the header is not counted).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line endings to the csv module.
    with path.open("w", encoding="utf-8", newline="") as handle:
        csv.writer(handle).writerows([columns, *rows])
    return len(rows)


def export_query(connection: duckdb.DuckDBPyConnection, export_dir: Path, file_name: str, sql: str) -> int:
    """Run ``sql`` and dump its full result set to ``export_dir / file_name``.

    Column names for the header come from the cursor description.

    Returns:
        The number of exported rows.
    """
    result = connection.execute(sql)
    header = [meta[0] for meta in result.description]
    return write_csv(export_dir / file_name, header, result.fetchall())
def env_path(name: str, default: Path) -> Path:
    """Resolve a filesystem path from environment variable ``name``.

    Returns ``default`` when the variable is unset or empty. A relative value
    is resolved against ``ROOT``; an absolute value is returned unchanged.
    """
    value = os.getenv(name)
    if not value:
        return default
    path = Path(value)
    return path if path.is_absolute() else ROOT / path


def json_value(value: Any) -> Any:
    """Convert one query-result value into something ``json.dumps`` accepts.

    Dates and datetimes become ISO-8601 strings, ``Decimal`` becomes ``float``
    and ``Path`` becomes ``str``. Lists and dicts are converted recursively so
    nested values are handled the same way as top-level ones. Anything else is
    returned unchanged.
    """
    # datetime is a subclass of date, so a single check covers both.
    if isinstance(value, date):
        return value.isoformat()
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, list):
        return [json_value(item) for item in value]
    if isinstance(value, dict):
        return {key: json_value(item) for key, item in value.items()}
    return value


def normalize_row(row: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``row`` with every value passed through ``json_value``."""
    return {key: json_value(value) for key, value in row.items()}
def scalar(connection: duckdb.DuckDBPyConnection, sql: str, params: tuple[Any, ...] = (), default: Any = None) -> Any:
    """Return the first column of the first row of ``sql``, or ``default`` when it yields no rows."""
    fetched = connection.execute(sql, params).fetchall()
    return fetched[0][0] if fetched else default


def has_table(connection: duckdb.DuckDBPyConnection, table_name: str) -> bool:
    """Report whether ``table_name`` is listed in ``information_schema.tables`` for schema ``main``."""
    match_count = scalar(
        connection,
        """
        select count(*)
        from information_schema.tables
        where table_schema = 'main'
          and table_name = ?
        """,
        (table_name,),
        0,
    )
    return bool(match_count)


def required_tables_missing(connection: duckdb.DuckDBPyConnection) -> list[str]:
    """List the mart tables the snapshot needs that are absent, in declaration order."""
    required_tables = (
        "yamusic_library_profile",
        "yamusic_artist_affinity",
        "yamusic_dim_tracks",
        "yamusic_genre_profile",
        "yamusic_genre_periods",
        "yamusic_period_activity",
        "yamusic_track_signals",
        "yamusic_playlist_overlap",
        "yamusic_playlist_signals",
    )
    absent: list[str] = []
    for table in required_tables:
        if not has_table(connection, table):
            absent.append(table)
    return absent
Run make dbt-build first.") + profile = profile_rows[0] + + diagnostics = { + "liked_shortcuts_seen": profile.get("diagnostic_liked_shortcuts_seen"), + "liked_tracks_written": profile.get("diagnostic_liked_tracks_written"), + "liked_shortcuts_fetch_failed": profile.get("diagnostic_liked_shortcuts_fetch_failed"), + "liked_shortcuts_missing_track_id": profile.get("diagnostic_liked_shortcuts_missing_track_id"), + "liked_tracks_duplicate_skipped": profile.get("diagnostic_liked_tracks_duplicate_skipped"), + "liked_albums_seen": profile.get("diagnostic_liked_albums_seen"), + "liked_albums_written": profile.get("diagnostic_liked_albums_written"), + "liked_albums_missing_id": profile.get("diagnostic_liked_albums_missing_id"), + "liked_albums_duplicate_skipped": profile.get("diagnostic_liked_albums_duplicate_skipped"), + "liked_artists_seen": profile.get("diagnostic_liked_artists_seen"), + "liked_artists_written": profile.get("diagnostic_liked_artists_written"), + "liked_artists_missing_id": profile.get("diagnostic_liked_artists_missing_id"), + "liked_artists_duplicate_skipped": profile.get("diagnostic_liked_artists_duplicate_skipped"), + "liked_playlists_seen": profile.get("diagnostic_liked_playlists_seen"), + "liked_playlists_written": profile.get("diagnostic_liked_playlists_written"), + "liked_playlists_missing_id": profile.get("diagnostic_liked_playlists_missing_id"), + "liked_playlists_duplicate_skipped": profile.get("diagnostic_liked_playlists_duplicate_skipped"), + "playlists_seen": profile.get("diagnostic_playlists_seen"), + "playlists_written": profile.get("diagnostic_playlists_written"), + "playlists_missing_id": profile.get("diagnostic_playlists_missing_id"), + "playlist_fetch_fallbacks": profile.get("diagnostic_playlist_fetch_fallbacks"), + "playlist_tracks_seen": profile.get("diagnostic_playlist_tracks_seen"), + "playlist_tracks_written": profile.get("diagnostic_playlist_tracks_written"), + "playlist_tracks_fetch_failed": 
profile.get("diagnostic_playlist_tracks_fetch_failed"), + "playlist_tracks_missing_track_id": profile.get("diagnostic_playlist_tracks_missing_track_id"), + "playlist_tracks_duplicate_skipped": profile.get("diagnostic_playlist_tracks_duplicate_skipped"), + } + raw_counts = { + "tracks": profile.get("raw_tracks"), + "artists": profile.get("raw_artists"), + "albums": profile.get("raw_albums"), + "playlists": profile.get("raw_playlists"), + "playlist_tracks": profile.get("raw_playlist_tracks"), + "user_library_events": profile.get("raw_user_library_events"), + } + raw_checksums = { + "tracks": profile.get("raw_tracks_sha256"), + "artists": profile.get("raw_artists_sha256"), + "albums": profile.get("raw_albums_sha256"), + "playlists": profile.get("raw_playlists_sha256"), + "playlist_tracks": profile.get("raw_playlist_tracks_sha256"), + "user_library_events": profile.get("raw_user_library_events_sha256"), + } + + answers = { + "favorite_artists": query_rows( + connection, + """ + select artist_name, track_count, liked_track_count, playlist_appearances, + avg_playlist_appearances_per_track + from yamusic_artist_affinity + order by track_count desc, liked_track_count desc, artist_name + """, + 20, + ), + "favorite_tracks": query_rows( + connection, + """ + select title, artist_display, genre, liked, playlist_count + from yamusic_track_signals + order by liked desc, playlist_count desc, title + """, + 20, + ), + "genre_profile": query_rows( + connection, + """ + select genre, track_count, liked_track_count, library_hours, track_share + from yamusic_genre_profile + order by track_count desc, liked_track_count desc, genre + """, + 20, + ), + "genre_shifts": query_rows( + connection, + """ + select activity_month, genre, event_count, active_tracks, event_share_in_month + from yamusic_genre_periods + order by activity_month desc, event_count desc, genre + """, + 24, + ), + "active_periods": query_rows( + connection, + """ + select activity_month, event_count, liked_events, 
playlist_events, + active_tracks, active_artists + from yamusic_period_activity + order by activity_month desc + """, + 24, + ), + "repeat_tracks": query_rows( + connection, + """ + select title, artist_display, genre, playlist_slots, playlist_count, repeat_signal + from yamusic_track_signals + where repeat_signal > 0 + order by repeat_signal desc, playlist_slots desc, title + """, + 20, + ), + "underrated_tracks": query_rows( + connection, + """ + select title, artist_display, genre, playlist_slots, playlist_count + from yamusic_track_signals + where underrated_flag = true + order by playlist_slots asc, title + """, + 20, + ), + "playlist_overlap": query_rows( + connection, + """ + select playlist_a_title, playlist_b_title, overlap_track_count, jaccard_overlap + from yamusic_playlist_overlap + order by jaccard_overlap desc, overlap_track_count desc, + playlist_a_title, playlist_b_title + """, + 20, + ), + "underrated_playlists": query_rows( + connection, + """ + select playlist_title, actual_track_count, unique_track_count, uniqueness_ratio, + max_overlap, overlapped_track_mentions + from yamusic_playlist_signals + where underrated_playlist_flag = true + order by uniqueness_ratio desc, actual_track_count desc, playlist_title + """, + 20, + ), + } + + source = str(profile.get("manifest_source") or "unknown") + total_tracks = int(profile.get("total_tracks") or 0) + stale = bool(profile.get("stale_ingestion_flag")) + next_actions = [] + if source != "yandex_music": + next_actions.append("Set YANDEX_MUSIC_TOKEN in .env and run make acceptance-real to replace sample metadata.") + if stale: + next_actions.append("Rerun ingestion or inspect timestamp availability because stale_ingestion_flag is true.") + if total_tracks == 0: + next_actions.append("Run make ingest-sample for deterministic demo data or verify that the Yandex Music account exposes library metadata.") + if not next_actions: + next_actions.append("Open make dashboard and use filters to inspect artists, 
genres, playlists and data quality.") + + return { + "schema_version": "1.0", + "generated_at": datetime.now(timezone.utc).isoformat(), + "product": "Streamify Yandex Music Self-Analytics", + "source": source, + "real_account_verified": source == "yandex_music", + "profile": profile, + "quality": { + "stale_ingestion_flag": profile.get("stale_ingestion_flag"), + "manifest_generated_at": profile.get("manifest_generated_at"), + "manifest_json_only": profile.get("manifest_json_only"), + "adapter": { + "adapter_name": profile.get("adapter_name"), + "adapter_version": profile.get("adapter_version"), + "client_library": profile.get("client_library"), + "client_library_version": profile.get("client_library_version"), + }, + "raw_counts": raw_counts, + "raw_checksums": raw_checksums, + "ingestion_diagnostics": diagnostics, + }, + "answers": answers, + "next_actions": next_actions, + } + + +def main() -> int: + duckdb_path = env_path("STREAMIFY_DUCKDB_PATH", DEFAULT_DUCKDB_PATH) + snapshot_path = env_path("STREAMIFY_SNAPSHOT_PATH", DEFAULT_SNAPSHOT_PATH) + if not duckdb_path.exists(): + raise SystemExit(f"Local DuckDB database is missing: {duckdb_path}. 
def env_path(name: str, default: Path) -> Path:
    """Resolve a filesystem path from environment variable ``name``.

    Returns ``default`` when the variable is unset or empty. A relative value
    is resolved against ``ROOT``; an absolute value is returned unchanged.
    """
    configured = os.getenv(name)
    if configured:
        candidate = Path(configured)
        if not candidate.is_absolute():
            candidate = ROOT / candidate
        return candidate
    return default


def query_rows(connection: duckdb.DuckDBPyConnection, sql: str, limit: int | None = None) -> list[dict[str, Any]]:
    """Run ``sql`` and return each result row as a column-name -> value dict.

    When ``limit`` is given, trailing whitespace and semicolons are trimmed
    from ``sql`` and a ``limit <n>`` clause is appended before execution.
    """
    statement = sql
    if limit is not None:
        trimmed = sql.rstrip().rstrip(";")
        statement = f"{trimmed} limit {int(limit)}"
    result = connection.execute(statement)
    names = [meta[0] for meta in result.description]
    records: list[dict[str, Any]] = []
    for values in result.fetchall():
        records.append(dict(zip(names, values)))
    return records
def has_table(connection: duckdb.DuckDBPyConnection, table_name: str) -> bool:
    """Report whether ``table_name`` is listed in ``information_schema.tables`` for schema ``main``."""
    return bool(
        scalar(
            connection,
            """
            select count(*)
            from information_schema.tables
            where table_schema = 'main'
              and table_name = ?
            """,
            (table_name,),
            0,
        )
    )


def fmt(value: Any) -> str:
    """Format one value for a Markdown table cell.

    ``None`` becomes an empty string and booleans become ``yes``/``no``.
    Numbers get thousands separators, and non-integral floats and finite
    Decimals are shown with two decimals. Any other value is stringified with
    ``|`` escaped and line breaks replaced by spaces so it cannot break the
    table row.
    """
    # Local import: the module header does not import decimal.
    from decimal import Decimal

    if value is None:
        return ""
    if isinstance(value, bool):
        return "yes" if value else "no"
    if isinstance(value, Decimal) and value.is_finite():
        # Format Decimal query results exactly like floats.
        value = float(value)
    if isinstance(value, float):
        if value.is_integer():
            return f"{int(value):,}"
        return f"{value:,.2f}"
    if isinstance(value, int):
        return f"{value:,}"
    text = str(value).replace("|", "\\|")
    # Collapse CRLF first so it yields one space; a lone CR is also a line break.
    return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")


def pct(value: Any) -> str:
    """Format a 0-1 ratio as a percentage with one decimal, or ``n/a`` when it is missing or non-numeric."""
    if value is None:
        return "n/a"
    try:
        return f"{float(value) * 100:.1f}%"
    except (TypeError, ValueError):
        return "n/a"


def markdown_table(rows: list[dict[str, Any]], columns: list[tuple[str, str]], empty: str) -> str:
    """Render ``rows`` as a Markdown table.

    Args:
        rows: Result rows keyed by column name.
        columns: ``(row key, header label)`` pairs, in display order.
        empty: Text returned instead of a table when ``rows`` is empty.
    """
    if not rows:
        return empty
    headers = [label for _, label in columns]
    lines = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join("---" for _ in headers) + " |",
    ]
    lines.extend(
        "| " + " | ".join(fmt(row.get(key)) for key, _ in columns) + " |"
        for row in rows
    )
    return "\n".join(lines)
`{manifest_source}` from {manifest_generated_at}.** Use this to distinguish deterministic sample data from real Yandex Music metadata.", + "**No account metadata is available yet.** The pipeline built successfully, but Yandex Music returned zero tracks or the current run used an empty fixture.", + "**The product path is still verifiable.** Run `make ingest-sample` for deterministic demo data, or set `YANDEX_MUSIC_TOKEN` in `.env` and run `make acceptance-real` for account metadata.", + "**Data freshness needs a real library event.** The stale flag remains active until ingestion returns timestamped liked-track or playlist metadata.", + ] + + freshness = "stale" if stale else "fresh" + return [ + f"**The latest raw run source is `{manifest_source}` from {manifest_generated_at}.** Use this to distinguish deterministic sample data from real Yandex Music metadata.", + f"**The library contains {total_tracks:,} tracks across {artists:,} artists and {playlists:,} playlists.** The local warehouse estimates about {library_hours:,.1f} hours of catalogued music metadata.", + f"**Taste concentration is {concentration} for the top artist and {top_genre_share} for the top genre.** Use those shares to judge whether recommendations are narrow or broad.", + f"**The latest ingestion health is {freshness}.** The dashboard and report are driven by the same DuckDB marts, so this summary is reproducible from the local data files.", + ] + + +def build_report(connection: duckdb.DuckDBPyConnection) -> str: + required_tables = [ + "yamusic_library_profile", + "yamusic_artist_affinity", + "yamusic_genre_periods", + "yamusic_track_signals", + "yamusic_playlist_signals", + ] + missing = [table for table in required_tables if not has_table(connection, table)] + if missing: + raise SystemExit(f"Missing required mart tables: {', '.join(missing)}. 
Run make dbt-build first.") + + profile_rows = query_rows(connection, "select * from yamusic_library_profile limit 1") + if not profile_rows: + raise SystemExit("yamusic_library_profile is empty. Run make dbt-build first.") + profile = profile_rows[0] + + top_artists = query_rows( + connection, + """ + select artist_name, track_count, liked_track_count, playlist_appearances, + avg_playlist_appearances_per_track + from yamusic_artist_affinity + order by track_count desc, liked_track_count desc, artist_name + """, + 10, + ) + genre_shifts = query_rows( + connection, + """ + select activity_month, genre, event_count, active_tracks, event_share_in_month + from yamusic_genre_periods + order by activity_month desc, event_count desc, genre + """, + 12, + ) + repeated_tracks = query_rows( + connection, + """ + select title, artist_display, genre, playlist_slots, playlist_count, repeat_signal + from yamusic_track_signals + where repeat_signal > 0 + order by repeat_signal desc, playlist_slots desc, title + """, + 10, + ) + underrated_tracks = query_rows( + connection, + """ + select title, artist_display, genre, playlist_slots, playlist_count + from yamusic_track_signals + where underrated_flag = true + order by playlist_slots asc, title + """, + 10, + ) + underrated_playlists = query_rows( + connection, + """ + select playlist_title, actual_track_count, unique_track_count, uniqueness_ratio, + max_overlap, overlapped_track_mentions + from yamusic_playlist_signals + where underrated_playlist_flag = true + order by uniqueness_ratio desc, actual_track_count desc, playlist_title + """, + 10, + ) + + generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + total_tracks = int(profile.get("total_tracks") or 0) + stale = bool(profile.get("stale_ingestion_flag")) + profile_display = dict(profile) + profile_display["stale_ingestion_flag"] = "yes" if stale else "no" + profile_display["manifest_json_only"] = "yes" if profile.get("manifest_json_only") else "no" + for 
row in genre_shifts: + row["event_share_in_month"] = pct(row.get("event_share_in_month")) + for row in underrated_playlists: + row["uniqueness_ratio"] = pct(row.get("uniqueness_ratio")) + row["max_overlap"] = pct(row.get("max_overlap")) + + lines = [ + "# Streamify Yandex Music Self-Analytics Summary", + "", + f"Generated: {generated_at}", + "", + "## Executive Summary", + "", + *[f"- {item}" for item in profile_summary(profile)], + "", + "## What The Local Library Looks Like", + "", + "These headline metrics come from `yamusic_library_profile`, the one-row mart that combines raw ingestion freshness, artist concentration, playlist coverage, genre availability and signal counts.", + "", + markdown_table( + [profile_display], + [ + ("total_tracks", "Tracks"), + ("liked_tracks", "Liked"), + ("artists", "Artists"), + ("playlists", "Playlists"), + ("library_hours", "Library hours"), + ("known_genres", "Known genres"), + ("active_months", "Active months"), + ("stale_ingestion_flag", "Stale"), + ("manifest_source", "Source"), + ], + "No profile row is available.", + ), + "", + "## Raw Ingestion Counts", + "", + "These counts come from `_manifest.json` through `stg_yamusic_manifest` and are copied into `yamusic_library_profile` to make stale dbt builds visible.", + "", + markdown_table( + [profile_display], + [ + ("manifest_generated_at", "Manifest generated"), + ("manifest_json_only", "JSON only"), + ("adapter_name", "Adapter"), + ("adapter_version", "Adapter version"), + ("client_library", "Client library"), + ("client_library_version", "Client version"), + ("raw_tracks", "Raw tracks"), + ("raw_artists", "Raw artists"), + ("raw_albums", "Raw albums"), + ("raw_playlists", "Raw playlists"), + ("raw_playlist_tracks", "Raw playlist tracks"), + ("raw_user_library_events", "Raw events"), + ], + "No raw manifest profile is available.", + ), + "", + "### Raw File Checksums", + "", + "These SHA256 checksums identify the exact JSONL files used for the local DuckDB build.", + "", + 
markdown_table( + [profile_display], + [ + ("raw_tracks_sha256", "Tracks"), + ("raw_artists_sha256", "Artists"), + ("raw_albums_sha256", "Albums"), + ("raw_playlists_sha256", "Playlists"), + ("raw_playlist_tracks_sha256", "Playlist tracks"), + ("raw_user_library_events_sha256", "Events"), + ], + "No raw checksums are available.", + ), + "", + "### Ingestion Diagnostics", + "", + "Diagnostics are aggregate counters only. They help identify partial Yandex Music API responses without storing skipped track, playlist or account identifiers.", + "", + markdown_table( + [profile_display], + [ + ("diagnostic_liked_shortcuts_seen", "Liked shortcuts seen"), + ("diagnostic_liked_tracks_written", "Liked tracks written"), + ("diagnostic_liked_shortcuts_fetch_failed", "Liked fetch failures"), + ("diagnostic_liked_shortcuts_missing_track_id", "Liked missing IDs"), + ("diagnostic_liked_tracks_duplicate_skipped", "Liked duplicates skipped"), + ("diagnostic_liked_albums_seen", "Liked albums seen"), + ("diagnostic_liked_albums_written", "Liked albums written"), + ("diagnostic_liked_albums_missing_id", "Liked albums missing IDs"), + ("diagnostic_liked_albums_duplicate_skipped", "Liked album duplicates skipped"), + ("diagnostic_liked_artists_seen", "Liked artists seen"), + ("diagnostic_liked_artists_written", "Liked artists written"), + ("diagnostic_liked_artists_missing_id", "Liked artists missing IDs"), + ("diagnostic_liked_artists_duplicate_skipped", "Liked artist duplicates skipped"), + ("diagnostic_liked_playlists_seen", "Liked playlists seen"), + ("diagnostic_liked_playlists_written", "Liked playlists written"), + ("diagnostic_liked_playlists_missing_id", "Liked playlists missing IDs"), + ("diagnostic_liked_playlists_duplicate_skipped", "Liked playlist duplicates skipped"), + ("diagnostic_playlists_seen", "Playlists seen"), + ("diagnostic_playlists_written", "Playlists written"), + ("diagnostic_playlists_missing_id", "Playlists missing IDs"), + 
("diagnostic_playlist_fetch_fallbacks", "Playlist fetch fallbacks"), + ("diagnostic_playlist_tracks_seen", "Playlist tracks seen"), + ("diagnostic_playlist_tracks_written", "Playlist tracks written"), + ("diagnostic_playlist_tracks_fetch_failed", "Playlist track fetch failures"), + ("diagnostic_playlist_tracks_missing_track_id", "Playlist tracks missing IDs"), + ("diagnostic_playlist_tracks_duplicate_skipped", "Playlist duplicates skipped"), + ], + "No ingestion diagnostics are available.", + ), + "", + "## Artist Affinity Is The Main Taste Signal", + "", + "Top artists are ranked by catalog presence first, then liked-track count. This makes the table useful for deciding whether the library is concentrated around a few artists or spread across many smaller preferences.", + "", + markdown_table( + top_artists, + [ + ("artist_name", "Artist"), + ("track_count", "Tracks"), + ("liked_track_count", "Liked tracks"), + ("playlist_appearances", "Playlist slots"), + ("avg_playlist_appearances_per_track", "Slots per track"), + ], + "No artist rows are available.", + ), + "", + "## Genre Shifts Depend On Metadata Coverage", + "", + "Genre-period rows use only tracks where Yandex Music exposes genre metadata. When genre coverage is sparse, treat this as a directional view rather than a complete listening history.", + "", + markdown_table( + genre_shifts, + [ + ("activity_month", "Month"), + ("genre", "Genre"), + ("event_count", "Events"), + ("active_tracks", "Tracks"), + ("event_share_in_month", "Share"), + ], + "No genre-period rows are available.", + ), + "", + "## Repeats And Underrated Tracks Show Actionable Library Work", + "", + "Repeated tracks are useful for playlist cleanup and taste concentration checks. 
Underrated tracks are liked tracks with low playlist coverage, which makes them candidates for rediscovery playlists.", + "", + markdown_table( + repeated_tracks, + [ + ("title", "Track"), + ("artist_display", "Artist"), + ("genre", "Genre"), + ("playlist_slots", "Playlist slots"), + ("playlist_count", "Playlists"), + ("repeat_signal", "Repeat signal"), + ], + "No repeated-track signals are available.", + ), + "", + markdown_table( + underrated_tracks, + [ + ("title", "Track"), + ("artist_display", "Artist"), + ("genre", "Genre"), + ("playlist_slots", "Playlist slots"), + ("playlist_count", "Playlists"), + ], + "No underrated-track candidates are available.", + ), + "", + "## Playlist Overlap Highlights Where Curation Can Improve", + "", + "Underrated playlists have high uniqueness and low overlap. They are good candidates for highlighting because they add variety rather than duplicating the same tracks across the library.", + "", + markdown_table( + underrated_playlists, + [ + ("playlist_title", "Playlist"), + ("actual_track_count", "Tracks"), + ("unique_track_count", "Unique tracks"), + ("uniqueness_ratio", "Uniqueness"), + ("max_overlap", "Max overlap"), + ("overlapped_track_mentions", "Overlap mentions"), + ], + "No underrated-playlist candidates are available.", + ), + "", + "## Recommended Next Steps", + "", + "- Use `make dashboard` for interactive filtering after reading this static summary.", + "- Run `make acceptance-real` after adding a real `YANDEX_MUSIC_TOKEN` to refresh the report from account metadata.", + "- Watch `stale_ingestion_flag`; if it is true on a real account, rerun ingestion or inspect whether the API returned timestamped events.", + "", + "## Further Questions", + "", + "- Which genres or languages are underrepresented because the Yandex Music API did not expose metadata?", + "- Which playlist overlaps should be merged, split or archived?", + "- Which underrated tracks should be promoted into a rediscovery playlist?", + "", + "## Caveats 
And Assumptions", + "", + "- The project stores metadata, events and aggregates only; it does not download or store audio.", + "- Yandex Music integration uses an unofficial Python client, so available fields can vary by account, region and library visibility.", + "- This summary is not a full listening-history analysis unless the account/API returns timestamped history-like metadata.", + ] + + if total_tracks == 0: + lines.extend( + [ + "", + "## No-Data Runbook", + "", + "A zero-track report is still a valid local build check. For real analytics, set `YANDEX_MUSIC_TOKEN` in `.env`, run `make preflight`, then run `make acceptance-real`.", + ] + ) + if stale: + lines.extend( + [ + "", + "## Freshness Warning", + "", + "`stale_ingestion_flag` is true. The latest local event is missing or older than the configured freshness threshold, so use the report for structure validation until ingestion is refreshed.", + ] + ) + + return "\n".join(lines) + "\n" + + +def main() -> int: + duckdb_path = env_path("STREAMIFY_DUCKDB_PATH", DEFAULT_DUCKDB_PATH) + report_path = env_path("STREAMIFY_REPORT_PATH", DEFAULT_REPORT_PATH) + if not duckdb_path.exists(): + raise SystemExit(f"Local DuckDB database is missing: {duckdb_path}. 
Run make dbt-build first.") + + report_path.parent.mkdir(parents=True, exist_ok=True) + with duckdb.connect(str(duckdb_path), read_only=True) as connection: + report_path.write_text(build_report(connection), encoding="utf-8") + print(f"Wrote Yandex Music self-analytics summary: {report_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/run_with_dotenv.py b/scripts/run_with_dotenv.py new file mode 100644 index 0000000..b20c54b --- /dev/null +++ b/scripts/run_with_dotenv.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import load_dotenv + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Load Streamify .env once, then exec a command without Make parsing secrets." + ) + parser.add_argument("--cwd", default=str(ROOT), help="Working directory for the command.") + parser.add_argument("command", nargs=argparse.REMAINDER) + args = parser.parse_args() + if args.command and args.command[0] == "--": + args.command = args.command[1:] + return args + + +def main() -> int: + args = parse_args() + if not args.command: + print("ERROR: command is required after --", file=sys.stderr) + return 2 + + load_dotenv(ROOT / ".env") + cwd = Path(args.cwd) + if not cwd.is_absolute(): + cwd = ROOT / cwd + os.chdir(cwd) + os.execvpe(args.command[0], args.command, os.environ) + return 127 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/smoke_compose_local.py b/scripts/smoke_compose_local.py new file mode 100644 index 0000000..5829bd1 --- /dev/null +++ b/scripts/smoke_compose_local.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import socket +import subprocess +import sys +import time +from pathlib import Path +from 
urllib.error import URLError +from urllib.request import urlopen + + +ROOT = Path(__file__).resolve().parents[1] +COMPOSE_FILE = ROOT / "docker-compose.local.yml" + + +def free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def run_compose(args: list[str], env: dict[str, str], check: bool = True) -> subprocess.CompletedProcess[str]: + command = ["docker", "compose", "-f", str(COMPOSE_FILE), "--profile", "local", *args] + return subprocess.run(command, cwd=ROOT, env=env, text=True, capture_output=True, check=check) + + +def wait_for_http(url: str, env: dict[str, str], timeout_seconds: int = 120) -> None: + deadline = time.monotonic() + timeout_seconds + last_error: Exception | None = None + while time.monotonic() < deadline: + status = run_compose(["ps", "--format", "json"], env, check=False) + if status.returncode != 0: + last_error = RuntimeError(status.stderr.strip() or status.stdout.strip()) + try: + with urlopen(url, timeout=2) as response: + if response.status == 200: + return + except (URLError, TimeoutError, OSError) as error: + last_error = error + time.sleep(1) + raise RuntimeError(f"compose dashboard did not return HTTP 200 at {url}: {last_error}") + + +def assert_no_runtime_failures(log_output: str) -> None: + failure_markers = [ + "Traceback", + "ModuleNotFoundError", + "Local DuckDB database is missing", + "The local marts are not ready yet", + ] + for marker in failure_markers: + if marker in log_output: + raise RuntimeError(f"compose dashboard emitted failure marker: {marker}") + + +def run_host_check(args: list[str], env: dict[str, str]) -> None: + result = subprocess.run( + [sys.executable, *args], + cwd=ROOT, + env=env, + text=True, + capture_output=True, + check=False, + ) + if result.returncode != 0: + output = result.stdout[-4000:] + result.stderr[-4000:] + raise RuntimeError(f"host validation failed for {' '.join(args)}:\n{output}") + + 
+def main() -> int: + port = free_port() + url = f"http://127.0.0.1:{port}" + env = os.environ.copy() + env["STREAMIFY_DASHBOARD_PORT"] = str(port) + # Compose smoke must be deterministic and must not call a real account. + env["YANDEX_MUSIC_TOKEN"] = "" + + try: + run_compose(["up", "--build", "-d", "dashboard"], env) + wait_for_http(url, env) + time.sleep(1) + logs = run_compose(["logs", "--no-color", "--tail", "300"], env, check=False) + assert_no_runtime_failures(logs.stdout + logs.stderr) + for check_args in [ + ["scripts/validate_yamusic_raw_contract.py"], + ["scripts/audit_yamusic_readiness.py"], + ["scripts/smoke_product_answers.py"], + ["scripts/smoke_dashboard_content.py"], + ]: + run_host_check(check_args, env) + except (subprocess.CalledProcessError, RuntimeError) as error: + logs = run_compose(["logs", "--no-color", "--tail", "200"], env, check=False) + print(f"ERROR: {error}", file=sys.stderr) + if isinstance(error, subprocess.CalledProcessError): + print(error.stdout[-4000:], file=sys.stderr) + print(error.stderr[-4000:], file=sys.stderr) + print(logs.stdout[-8000:], file=sys.stderr) + print(logs.stderr[-4000:], file=sys.stderr) + return 1 + finally: + run_compose(["down", "--remove-orphans"], env, check=False) + + print(f"OK: docker compose local profile returned HTTP 200 at {url} and produced valid local product artifacts.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/smoke_dashboard.py b/scripts/smoke_dashboard.py new file mode 100644 index 0000000..4330773 --- /dev/null +++ b/scripts/smoke_dashboard.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import shutil +import socket +import subprocess +import sys +import time +from pathlib import Path +from urllib.error import URLError +from urllib.request import urlopen + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import load_dotenv + 
+load_dotenv(ROOT / ".env") +DUCKDB_PATH = ROOT / os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb") + + +def streamlit_executable() -> Path: + candidate = Path(sys.executable).parent / "streamlit" + if candidate.exists(): + return candidate + resolved = shutil.which("streamlit") + if resolved: + return Path(resolved) + raise RuntimeError("streamlit executable was not found") + + +def free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("127.0.0.1", 0)) + return int(sock.getsockname()[1]) + + +def wait_for_http(url: str, process: subprocess.Popen[str], timeout_seconds: int = 20) -> None: + deadline = time.monotonic() + timeout_seconds + last_error: Exception | None = None + while time.monotonic() < deadline: + if process.poll() is not None: + raise RuntimeError(f"dashboard process exited early with code {process.returncode}") + try: + with urlopen(url, timeout=2) as response: + if response.status == 200: + return + except (URLError, TimeoutError, OSError) as error: + last_error = error + time.sleep(0.5) + raise RuntimeError(f"dashboard did not return HTTP 200 at {url}: {last_error}") + + +def main() -> int: + if not DUCKDB_PATH.exists(): + print(f"ERROR: local DuckDB database is missing: {DUCKDB_PATH}", file=sys.stderr) + print("Run `make acceptance-local` before dashboard smoke.", file=sys.stderr) + return 1 + + port = free_port() + url = f"http://127.0.0.1:{port}" + env = os.environ.copy() + env["STREAMIFY_DUCKDB_PATH"] = str(DUCKDB_PATH) + env.setdefault("STREAMLIT_BROWSER_GATHER_USAGE_STATS", "false") + + command = [ + str(streamlit_executable()), + "run", + "dashboard/app.py", + "--server.address=127.0.0.1", + f"--server.port={port}", + "--server.headless=true", + ] + process = subprocess.Popen( + command, + cwd=ROOT, + env=env, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + try: + wait_for_http(url, process) + time.sleep(1) + finally: + process.terminate() + try: + output, _ = 
process.communicate(timeout=5) + except subprocess.TimeoutExpired: + process.kill() + output, _ = process.communicate(timeout=5) + + if process.returncode not in {0, -15, None}: + print(output[-4000:], file=sys.stderr) + return int(process.returncode) + + failure_markers = ["Traceback", "ModuleNotFoundError", "Local DuckDB database is missing"] + for marker in failure_markers: + if marker in output: + print(output[-4000:], file=sys.stderr) + print(f"ERROR: dashboard emitted failure marker: {marker}", file=sys.stderr) + return 1 + + print(f"OK: dashboard returned HTTP 200 at {url}.") + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except RuntimeError as error: + print(f"ERROR: {error}", file=sys.stderr) + raise SystemExit(1) diff --git a/scripts/smoke_dashboard_content.py b/scripts/smoke_dashboard_content.py new file mode 100644 index 0000000..82e7d89 --- /dev/null +++ b/scripts/smoke_dashboard_content.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from streamlit.testing.v1 import AppTest + +from yamusic_ingest.config import load_dotenv + +load_dotenv(ROOT / ".env") +DUCKDB_PATH = ROOT / os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb") + + +def fail(message: str) -> None: + raise AssertionError(message) + + +def values(elements: object) -> list[str]: + return [str(getattr(element, "value", "")) for element in elements] + + +def labels(elements: object) -> list[str]: + return [str(getattr(element, "label", "")) for element in elements] + + +def require_contains(actual: list[str], expected: list[str], label: str) -> None: + missing = [value for value in expected if value not in actual] + if missing: + fail(f"dashboard {label} missing expected values: {missing}; actual={actual}") + + +def main() -> int: + if not DUCKDB_PATH.exists(): + fail(f"Local DuckDB database 
is missing: {DUCKDB_PATH}. Run make acceptance-local first.") + + os.environ["STREAMIFY_DUCKDB_PATH"] = str(DUCKDB_PATH) + app = AppTest.from_file(ROOT / "dashboard" / "app.py", default_timeout=10) + app.run() + + if app.error: + fail(f"dashboard emitted st.error elements: {values(app.error)}") + if app.exception: + fail(f"dashboard emitted st.exception elements: {values(app.exception)}") + + require_contains(values(app.title), ["Streamify Self-Analytics"], "title") + require_contains( + values(app.caption), + ["Local Yandex Music metadata analytics. Audio is not downloaded or stored."], + "caption", + ) + require_contains( + labels(app.metric), + [ + "Tracks", + "Liked", + "Artists", + "Playlists", + "Hours", + "Source", + "Raw tracks", + "Known genres", + "Active months", + "Underrated tracks", + "Underrated playlists", + "Top artist concentration", + ], + "metrics", + ) + require_contains( + labels(app.tabs), + ["Overview", "Periods", "Artists", "Genres", "Playlists", "Tracks", "Actions", "Data Quality"], + "tabs", + ) + require_contains( + values(app.subheader), + [ + "Library snapshot", + "Activity periods", + "Genre shifts", + "Artist affinity", + "Genre diversity", + "Playlist coverage", + "Playlist overlap", + "Repeated and underrated tracks", + "Next actions", + "Rediscovery queue", + "Playlist cleanup candidates", + "Local data quality signals", + ], + "sections", + ) + if len(app.dataframe) < 8: + fail(f"dashboard should expose multiple analytical dataframes, found {len(app.dataframe)}") + if not app.json: + fail("dashboard Data Quality tab should expose a JSON quality block") + + print("OK: dashboard content exposes the expected self-analytics sections.") + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except AssertionError as error: + print(f"ERROR: {error}", file=sys.stderr) + raise SystemExit(1) diff --git a/scripts/smoke_empty_yamusic_dbt.py b/scripts/smoke_empty_yamusic_dbt.py new file mode 100644 index 
0000000..84463e1 --- /dev/null +++ b/scripts/smoke_empty_yamusic_dbt.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +from hashlib import sha256 +from pathlib import Path + +import duckdb + + +ROOT = Path(__file__).resolve().parents[1] +RAW_DIR = ROOT / "data" / "raw" / "yamusic_empty_smoke" +DUCKDB_PATH = ROOT / "data" / "streamify_empty_smoke.duckdb" +DATASETS = [ + "tracks", + "artists", + "albums", + "playlists", + "playlist_tracks", + "user_library_events", +] + + +def file_sha256(path: Path) -> str: + digest = sha256() + with path.open("rb") as file: + for chunk in iter(lambda: file.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def cleanup() -> None: + shutil.rmtree(RAW_DIR, ignore_errors=True) + for path in [DUCKDB_PATH, DUCKDB_PATH.with_suffix(".duckdb.wal")]: + path.unlink(missing_ok=True) + + +def dbt_command() -> list[str]: + candidate = Path(sys.executable).resolve().parent / "dbt" + if candidate.exists(): + return [str(candidate)] + resolved = shutil.which("dbt") + if resolved: + return [resolved] + module_probe = subprocess.run( + [sys.executable, "-m", "dbt.cli.main", "--version"], + text=True, + capture_output=True, + check=False, + ) + if module_probe.returncode == 0: + return [sys.executable, "-m", "dbt.cli.main"] + raise RuntimeError("dbt executable was not found") + + +def main() -> int: + cleanup() + try: + RAW_DIR.mkdir(parents=True, exist_ok=True) + for dataset in DATASETS: + (RAW_DIR / f"{dataset}.jsonl").write_text("", encoding="utf-8") + manifest = { + "generated_at": "2026-01-01T00:00:00+00:00", + "source": "yandex_music", + "raw_dir": str(RAW_DIR), + "json_only": True, + "adapter": { + "adapter_name": "yamusic_ingest", + "adapter_version": "0.1.0", + "client_library": "yandex-music", + "client_library_version": None, + }, + "diagnostics": { + "liked_shortcuts_seen": 0, + "liked_tracks_written": 0, + 
"liked_shortcuts_fetch_failed": 0, + "liked_shortcuts_missing_track_id": 0, + "liked_tracks_duplicate_skipped": 0, + "liked_albums_seen": 0, + "liked_albums_written": 0, + "liked_albums_missing_id": 0, + "liked_albums_duplicate_skipped": 0, + "liked_artists_seen": 0, + "liked_artists_written": 0, + "liked_artists_missing_id": 0, + "liked_artists_duplicate_skipped": 0, + "liked_playlists_seen": 0, + "liked_playlists_written": 0, + "liked_playlists_missing_id": 0, + "liked_playlists_duplicate_skipped": 0, + "playlists_seen": 0, + "playlists_written": 0, + "playlists_missing_id": 0, + "playlist_fetch_fallbacks": 0, + "playlist_tracks_seen": 0, + "playlist_tracks_written": 0, + "playlist_tracks_fetch_failed": 0, + "playlist_tracks_missing_track_id": 0, + "playlist_tracks_duplicate_skipped": 0, + }, + "datasets": { + dataset: { + "jsonl_path": str(RAW_DIR / f"{dataset}.jsonl"), + "row_count": 0, + "jsonl_sha256": file_sha256(RAW_DIR / f"{dataset}.jsonl"), + "parquet_written": False, + } + for dataset in DATASETS + }, + } + (RAW_DIR / "_manifest.json").write_text(json.dumps(manifest, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + env = os.environ.copy() + env["GCP_PROJECT_ID"] = "dummy" + env["STREAMIFY_RAW_DIR"] = "data/raw/yamusic_empty_smoke" + env["STREAMIFY_DUCKDB_PATH"] = "data/streamify_empty_smoke.duckdb" + + deps_command = [ + *dbt_command(), + "deps", + ] + deps_result = subprocess.run(deps_command, cwd=ROOT / "dbt", env=env, text=True, capture_output=True, check=False) + if deps_result.returncode != 0: + print(deps_result.stdout[-4000:], file=sys.stderr) + print(deps_result.stderr[-4000:], file=sys.stderr) + return deps_result.returncode + + command = [ + *dbt_command(), + "build", + "--profiles-dir", + ".", + "--target", + "local", + "--select", + "yamusic", + "--no-partial-parse", + ] + result = subprocess.run(command, cwd=ROOT / "dbt", env=env, text=True, capture_output=True, check=False) + + if result.returncode != 0: + 
print(result.stdout[-4000:], file=sys.stderr) + print(result.stderr[-4000:], file=sys.stderr) + return result.returncode + + with duckdb.connect(str(DUCKDB_PATH), read_only=True) as conn: + profile = conn.execute( + """ + select total_tracks, liked_tracks, playlists, stale_ingestion_flag + from yamusic_library_profile + """ + ).fetchone() + finally: + cleanup() + + if profile != (0, 0, 0, 1): + print(f"ERROR: unexpected empty-profile values: {profile!r}", file=sys.stderr) + return 1 + + print("OK: empty Yandex Music raw datasets build with local dbt target.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/smoke_product_answers.py b/scripts/smoke_product_answers.py new file mode 100644 index 0000000..0674411 --- /dev/null +++ b/scripts/smoke_product_answers.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +import sys +import json +import csv +from pathlib import Path +from typing import Any + +import duckdb + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import load_dotenv + +load_dotenv(ROOT / ".env") +DUCKDB_PATH = ROOT / os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb") +REPORT_PATH = ROOT / os.getenv("STREAMIFY_REPORT_PATH", "data/streamify_summary.md") +SNAPSHOT_PATH = ROOT / os.getenv("STREAMIFY_SNAPSHOT_PATH", "data/streamify_snapshot.json") +RECOMMENDATIONS_DIR = ROOT / os.getenv("STREAMIFY_RECOMMENDATIONS_DIR", "data/recommendations") + + +def fail(message: str) -> None: + raise AssertionError(message) + + +def scalar(conn: duckdb.DuckDBPyConnection, sql: str) -> Any: + return conn.execute(sql).fetchone()[0] + + +def require_count(conn: duckdb.DuckDBPyConnection, label: str, sql: str) -> None: + count = int(scalar(conn, sql) or 0) + if count <= 0: + fail(f"Missing product answer coverage for {label}") + + +def check_duckdb_answers() -> None: + if not DUCKDB_PATH.exists(): + fail(f"Missing local DuckDB 
database: {DUCKDB_PATH}") + with duckdb.connect(str(DUCKDB_PATH), read_only=True) as conn: + total_tracks = int(scalar(conn, "select total_tracks from yamusic_library_profile") or 0) + if total_tracks <= 0: + fail("Product-answer smoke requires non-empty sample or account metadata") + + require_count(conn, "favorite artists", "select count(*) from yamusic_artist_affinity where track_count > 0") + require_count(conn, "favorite tracks", "select count(*) from yamusic_dim_tracks where title is not null") + require_count(conn, "repeat signals", "select count(*) from yamusic_track_signals where repeat_signal > 1") + require_count(conn, "genre shifts", "select count(*) from yamusic_genre_periods where event_share_in_month > 0") + require_count(conn, "diversity profile", "select count(*) from yamusic_genre_profile where track_share > 0") + require_count(conn, "active periods", "select count(*) from yamusic_period_activity where event_count > 0") + require_count(conn, "playlist overlap", "select count(*) from yamusic_playlist_overlap where overlap_track_count > 0") + require_count(conn, "playlist signals", "select count(*) from yamusic_playlist_signals where actual_track_count > 0") + require_count(conn, "underrated tracks", "select count(*) from yamusic_track_signals where underrated_flag in (0, 1)") + + quality = conn.execute( + """ + select + manifest_source, + adapter_name, + adapter_version, + client_library, + diagnostic_liked_shortcuts_seen, + diagnostic_liked_tracks_written, + diagnostic_playlist_tracks_seen, + diagnostic_playlist_tracks_written, + raw_tracks, + stale_ingestion_flag, + top_artist_concentration, + known_genres, + max_repeat_signal + from yamusic_library_profile + """ + ).fetchone() + ( + manifest_source, + adapter_name, + adapter_version, + client_library, + diagnostic_liked_shortcuts_seen, + diagnostic_liked_tracks_written, + diagnostic_playlist_tracks_seen, + diagnostic_playlist_tracks_written, + raw_tracks, + stale_ingestion_flag, + 
def check_snapshot_answers() -> None:
    """Validate the exported JSON self-analytics snapshot at ``SNAPSHOT_PATH``.

    The snapshot must be a JSON object exposing the schema version, product
    name, source, a ``profile`` object, a ``quality`` object (adapter, raw
    counts, raw checksums, ingestion diagnostics) and an ``answers`` object
    whose product-answer keys are all lists.

    Every structural problem is reported through ``fail`` so that malformed
    snapshots produce a readable error instead of an unhandled
    ``AttributeError``/``TypeError``/``ValueError``/``JSONDecodeError``.
    NOTE(review): ``fail`` is defined outside this block; it is assumed to
    raise (the script entry point catches ``AssertionError``) - confirm.
    """
    if not SNAPSHOT_PATH.exists():
        fail(f"Missing JSON self-analytics snapshot: {SNAPSHOT_PATH}")
    try:
        snapshot = json.loads(SNAPSHOT_PATH.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as error:
        fail(f"JSON snapshot is not valid UTF-8 JSON: {error}")
    if not isinstance(snapshot, dict):
        fail("JSON snapshot must be a JSON object")

    if snapshot.get("schema_version") != "1.0":
        fail("JSON snapshot must expose schema_version=1.0")
    if snapshot.get("product") != "Streamify Yandex Music Self-Analytics":
        fail("JSON snapshot must expose the product name")
    # Tuple membership compares with ==, so an unhashable value (list/dict)
    # is rejected cleanly instead of raising TypeError as a set lookup would.
    if snapshot.get("source") not in ("sample", "yandex_music"):
        fail("JSON snapshot must expose source as sample or yandex_music")
    if not isinstance(snapshot.get("profile"), dict):
        fail("JSON snapshot must expose a profile object")

    quality = snapshot.get("quality")
    if not isinstance(quality, dict):
        fail("JSON snapshot must expose a quality object")
    for key in ["adapter", "raw_counts", "raw_checksums", "ingestion_diagnostics"]:
        if key not in quality:
            fail(f"JSON snapshot quality must expose {key}")

    raw_checksums = quality["raw_checksums"]
    if not isinstance(raw_checksums, dict):
        fail("JSON snapshot quality.raw_checksums must be an object")
    for key, value in raw_checksums.items():
        if not isinstance(value, str) or len(value) != 64:
            fail(f"JSON snapshot raw checksum {key} must be a 64-character sha256 digest")

    answers = snapshot.get("answers")
    if not isinstance(answers, dict):
        fail("JSON snapshot must expose an answers object")
    for key in [
        "favorite_artists",
        "favorite_tracks",
        "genre_profile",
        "genre_shifts",
        "active_periods",
        "repeat_tracks",
        "underrated_tracks",
        "playlist_overlap",
        "underrated_playlists",
    ]:
        if key not in answers:
            fail(f"JSON snapshot answers must expose {key}")
        if not isinstance(answers[key], list):
            fail(f"JSON snapshot answer {key} must be a list")
    if not answers["favorite_artists"] or not answers["favorite_tracks"]:
        fail("JSON snapshot must include favorite artist and track rows for sample/account metadata")

    diagnostics = quality["ingestion_diagnostics"]
    if not isinstance(diagnostics, dict):
        fail("JSON snapshot quality.ingestion_diagnostics must be an object")
    for key in [
        "liked_shortcuts_seen",
        "liked_tracks_duplicate_skipped",
        "liked_albums_seen",
        "liked_albums_written",
        "liked_artists_seen",
        "liked_artists_written",
        "liked_playlists_seen",
        "liked_playlists_written",
        "playlist_tracks_seen",
        "playlist_tracks_fetch_failed",
        "playlist_tracks_duplicate_skipped",
    ]:
        try:
            count = int(diagnostics[key])
        except (KeyError, TypeError, ValueError):
            # Missing, null or non-numeric counters are reported exactly
            # like negative ones.
            count = -1
        if count < 0:
            fail(f"JSON snapshot diagnostics must expose non-negative {key}")
def require_markers(path: str, markers: list[str]) -> None:
    """Assert that the file at ``path`` contains every string in ``markers``.

    The file is loaded once through ``read``; the first marker (in list order)
    that is absent from the text triggers an ``AssertionError`` naming both
    the path and the missing marker.
    """
    content = read(path)
    absent = [marker for marker in markers if marker not in content]
    if absent:
        # Only the first missing marker is reported, in list order.
        raise AssertionError(f"{path} must contain {absent[0]!r}")
"docs/product_acceptance.md", + "dbt/models/yamusic/schema.yml", + "scripts/check_no_local_sensitive_artifacts.py", + "scripts/check_no_audio_artifacts.py", + "scripts/validate_yamusic_raw_contract.py", + "scripts/smoke_empty_yamusic_dbt.py", + "scripts/smoke_dashboard.py", + "scripts/smoke_dashboard_content.py", + "scripts/smoke_compose_local.py", + "scripts/smoke_real_gate.py", + "scripts/smoke_product_answers.py", + "scripts/run_with_dotenv.py", + "scripts/doctor_yamusic_local.py", + "scripts/export_yamusic_summary.py", + "scripts/export_yamusic_snapshot.py", + "scripts/export_yamusic_recommendations.py", + "scripts/audit_yamusic_readiness.py", + "scripts/build_pages_site.py", + ".github/workflows/pages.yml", + ".github/workflows/release.yml", + ".github/ISSUE_TEMPLATE/agent_task.yml", + ".github/ISSUE_TEMPLATE/data_quality.yml", + ".github/ISSUE_TEMPLATE/product_request.yml", + ".github/PULL_REQUEST_TEMPLATE.md", + "docs/project_management.md", + "docs/release_process.md", + "docs/releases/v0.1.0.md", + ]: + if not (ROOT / path).exists(): + raise AssertionError(f"Missing required local product file: {path}") + + require_markers( + "README.md", + ["Yandex Music", "DuckDB", "make help", "make status", "make ingest-sample", "make acceptance-real", "make dashboard", "genre shifts", "`local` profile", "DBT_THREADS=1", "scripts/run_with_dotenv.py", "fresh checkout", "make clean-local", "dbt target/logs/packages", "make readiness-real", "make up-local", "make snapshot", "make recommendations", "make pages-site", "GitHub Pages", "tag-based releases", "streamify_snapshot.json", "data/recommendations"], + ) + require_markers( + "docs/yandex_music_local.md", + ["YANDEX_MUSIC_TOKEN", "make acceptance-real", "make status", "make token-help", "bounded retries", "dbt build --profiles-dir . 
--target local", "No audio", "underrated tracks", "Real Account Acceptance Check", "Empty/private accounts", "scripts/run_with_dotenv.py", "make dbt-build", "make up-local", "streamify_empty_smoke", "--require-real", "stale Parquet cleanup", "JSONL sha256 checksums", "ingestion diagnostics", "ingestion diagnostics consistency", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "streamify_snapshot.json", "data/recommendations", "latest manifest source", "Actions tab"], + ) + require_markers( + "docs/yamusic_lineage.md", + ["Raw/Bronze", "Silver", "Gold", "liked albums", "liked artists", "liked playlists", "stg_yamusic_manifest", "adapter/client metadata", "diagnostics counters", "JSONL sha256 checksums", "ingestion diagnostics consistency", "yamusic_genre_periods", "Product Questions", "Quality Gates", "make acceptance-real", "referential integrity", "Snapshot export", "JSON snapshot", "Recommendations export"], + ) + require_markers( + "docs/product_acceptance.md", + ["Requirement Matrix", "make acceptance-local", "make test", "make acceptance-real", "real_account_verified", "No audio", "Yandex Music metadata ingestion", "make readiness-real", "make product-answers-smoke", "stale Parquet cleanup", "Source provenance", "data/streamify_snapshot.json", "make snapshot", "data/recommendations/*.csv", "make recommendations", "dashboard Actions tab"], + ) + require_markers("dbt/profiles.yml", ["type: duckdb", "target: dev", "DBT_THREADS"]) + require_markers(".env.example", ["YANDEX_MUSIC_TOKEN=", "STREAMIFY_REPORT_PATH", "STREAMIFY_SNAPSHOT_PATH", "STREAMIFY_RECOMMENDATIONS_DIR", "DBT_THREADS=1"]) + require_markers( + "dbt/models/yamusic/schema.yml", + ["stg_yamusic_tracks", "stg_yamusic_manifest", "manifest_source", "adapter_name", "client_library", "yamusic_artist_affinity", "yamusic_library_profile", "yamusic_period_activity", "yamusic_genre_periods", "yamusic_track_signals", "yamusic_playlist_signals", "stale_ingestion_flag", 
"diagnostic_liked_shortcuts_fetch_failed", "diagnostic_liked_tracks_duplicate_skipped", "diagnostic_liked_albums_seen", "diagnostic_liked_artists_seen", "diagnostic_liked_playlists_seen", "diagnostic_playlist_tracks_fetch_failed", "diagnostic_playlist_tracks_missing_track_id", "diagnostic_playlist_tracks_duplicate_skipped", "raw_tracks_sha256", "raw_user_library_events_sha256"], + ) + require_markers("dashboard/app.py", ["Local DuckDB database is missing", "Streamify Self-Analytics", "Periods", "Genre diversity", "Genre shifts", "Actions", "Next actions", "Rediscovery queue", "Playlist cleanup candidates", "Download snapshot", "Download action queues", "RECOMMENDATIONS_DIR", "No Yandex Music library metadata was returned", "manifest_source", "adapter_name", "raw_counts", "raw_checksums", "ingestion_diagnostics", "build_data_next_actions", "apply_track_filters", "st.sidebar.multiselect", "st.sidebar.selectbox", "st.sidebar.text_input"]) + require_markers("dashboard/actions.py", ["build_data_next_actions", "YANDEX_MUSIC_TOKEN", "stale_ingestion_flag", "liked shortcuts failed", "playlist shortcuts failed", "Data is ready for exploration"]) + require_markers("docker-compose.local.yml", ['profiles: ["local"]', "YANDEX_MUSIC_TOKEN", "service_completed_successfully", "DBT_THREADS", "set -euo pipefail", "READINESS_ARGS", "--require-real", "validate_yamusic_raw_contract.py", "doctor_yamusic_local.py", "export_yamusic_summary.py", "export_yamusic_snapshot.py", "export_yamusic_recommendations.py", "audit_yamusic_readiness.py"]) + require_markers("Makefile", ["help:", "token-help:", "pages-site:", "Streamify local Yandex Music self-analytics", "scripts/run_with_dotenv.py", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local up --build", "$(ENV_RUN) -- docker compose -f docker-compose.local.yml --profile local config --quiet", "dbt-build: dbt-deps", "status", "preflight", "dashboard-smoke", "compose-smoke-local", "acceptance-real", "raw-contract", 
"report", "snapshot", "recommendations", "readiness", "readiness-real", "real-gate-smoke", "product-answers-smoke", "check_no_local_sensitive_artifacts.py", "check_no_audio_artifacts.py", "smoke_empty_yamusic_dbt.py", "smoke_real_gate.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "acceptance-local", "doctor_yamusic_local.py", "streamify_empty", "dbt/dbt_packages", "streamify_snapshot.json", "data/recommendations", "build_pages_site.py"]) + reject_markers("Makefile", ["include .env"]) + require_markers("scripts/run_with_dotenv.py", ["load_dotenv", "os.execvpe", "--cwd", "Make parsing secrets"]) + require_markers(".github/workflows/data-quality.yml", ["make test", "YANDEX_MUSIC_TOKEN", "DBT_THREADS"]) + require_markers(".github/workflows/pages.yml", ["GitHub Pages", "make acceptance-local", "build_pages_site.py", "YANDEX_MUSIC_TOKEN: \"\"", "actions/deploy-pages"]) + require_markers(".github/workflows/release.yml", ["Release", "tags:", "make test", "git archive", "gh release create"]) + require_markers(".github/ISSUE_TEMPLATE/agent_task.yml", ["Agent lane", "Repo/Build", "Yandex Ingestion", "Analytics/dbt", "Product/Dashboard", "QA/Integration"]) + require_markers(".github/PULL_REQUEST_TEMPLATE.md", ["Product Value", "Data Engineering Impact", "make test", "make acceptance-real"]) + require_markers("docs/project_management.md", ["Agent Lanes", "Repo/Build", "Yandex Ingestion", "QA/Integration", "v0.1.0-local-mvp"]) + require_markers("docs/release_process.md", ["Release Checklist", "GitHub Pages", "sample metadata", "git tag vX.Y.Z"]) + require_markers("scripts/build_pages_site.py", ["PUBLIC_DIR", "Sample Summary", "streamify_summary.md", "index.html"]) + require_markers("scripts/check_no_local_sensitive_artifacts.py", ["FORBIDDEN_TRACKED_PATHS", "data/raw/yamusic", "DuckDB files", "audio artifacts are tracked"]) + require_markers("scripts/check_no_audio_artifacts.py", ["AUDIO_EXTENSIONS", "must not store audio files"]) + 
require_markers("scripts/validate_yamusic_raw_contract.py", ["SCHEMAS", "DIAGNOSTIC_FIELDS", "validate_diagnostic_consistency", "jsonl_sha256", "sha256 mismatch", "playlist_tracks_written", "playlist_tracks_fetch_failed", "liked_tracks_duplicate_skipped", "liked_playlists_written", "playlist_tracks_duplicate_skipped", "liked shortcut diagnostics must add up", "Yandex Music raw schema contract is valid", "user_library_events", "adapter_name", "client_library"]) + require_markers("scripts/smoke_empty_yamusic_dbt.py", ["yamusic_empty_smoke", "--no-partial-parse", "dbt.cli.main", "deps_command", "empty Yandex Music raw datasets", "stale_ingestion_flag", "jsonl_sha256"]) + require_markers("scripts/smoke_dashboard.py", ["dashboard returned HTTP 200", "STREAMIFY_DUCKDB_PATH", "server.headless=true"]) + require_markers("scripts/smoke_dashboard_content.py", ["AppTest", "Streamify Self-Analytics", "Data Quality", "Local data quality signals", "dashboard content exposes the expected self-analytics sections"]) + require_markers("scripts/smoke_compose_local.py", ["docker compose local profile returned HTTP 200", "produced valid local product artifacts", "YANDEX_MUSIC_TOKEN", "wait_for_http", "assert_no_runtime_failures", "run_host_check", "validate_yamusic_raw_contract.py", "smoke_product_answers.py", "smoke_dashboard_content.py", "ModuleNotFoundError"]) + require_markers("scripts/smoke_real_gate.py", ["sample metadata is rejected", "--require-real", "source=yandex_music", "YANDEX_MUSIC_TOKEN"]) + require_markers("scripts/smoke_product_answers.py", ["favorite artists", "repeat signals", "genre shifts", "playlist overlap", "Data Quality", "manifest_source", "adapter_name", "Raw Ingestion Counts", "Raw File Checksums", "raw_checksums", "diagnostic_liked_shortcuts_seen", "JSON snapshot", "streamify_snapshot.json", "recommendations export", "rediscovery_tracks.csv"]) + require_markers("scripts/doctor_yamusic_local.py", ["_manifest.json", "stg_yamusic_manifest", "adapter metadata", 
"yamusic_genre_periods", "raw counts", "local Yandex Music acceptance checks passed"]) + require_markers("scripts/export_yamusic_summary.py", ["Streamify Yandex Music Self-Analytics Summary", "STREAMIFY_REPORT_PATH", "yamusic_artist_affinity", "yamusic_playlist_signals", "Raw Ingestion Counts", "Raw File Checksums", "Ingestion Diagnostics", "Adapter version"]) + require_markers("scripts/export_yamusic_snapshot.py", ["Streamify Yandex Music Self-Analytics", "STREAMIFY_SNAPSHOT_PATH", "schema_version", "favorite_artists", "playlist_overlap", "raw_checksums", "ingestion_diagnostics"]) + require_markers("scripts/export_yamusic_recommendations.py", ["STREAMIFY_RECOMMENDATIONS_DIR", "rediscovery_tracks.csv", "playlist_cleanup.csv", "standout_playlists.csv", "genre_shifts.csv"]) + require_markers("scripts/audit_yamusic_readiness.py", ["real_account_verified", "local product readiness", "Audio files must not be stored", "yamusic_library_profile", "manifest_source", "adapter_name", "raw_checksums_from_profile", "ingestion_diagnostics", "snapshot_path", "recommendations_dir", "--require-real", "source=yandex_music"]) + require_markers("yamusic_ingest/config.py", ["def load_dotenv", "export ", "os.environ.setdefault"]) + require_markers("yamusic_ingest/yandex_client.py", ["IngestResult", "_call_with_retries", "sleep", "failed after", "client.users_likes_tracks", "client.users_likes_playlists", "shortcut.fetch_track"]) + require_markers("yamusic_ingest/__main__.py", ["DIAGNOSTIC_FIELDS", "liked_albums_seen", "liked_artists_seen", "liked_playlists_seen", "jsonl_sha256", "--status", "token_configured", "last_source", "manifest_read_error", "snapshot_exists", "recommendations_exists", "next_step", "make preflight", "make acceptance-real", "client_metadata"]) + require_markers("yamusic_ingest/io.py", ["file_sha256", "remove_file_if_exists", "write_parquet_if_available"]) + print("OK: local Yandex Music product contract is aligned.") + return 0 + + +if __name__ == "__main__": + 
try: + raise SystemExit(main()) + except AssertionError as error: + print(f"ERROR: {error}", file=sys.stderr) + raise SystemExit(1) diff --git a/scripts/validate_yamusic_raw_contract.py b/scripts/validate_yamusic_raw_contract.py new file mode 100644 index 0000000..87733d1 --- /dev/null +++ b/scripts/validate_yamusic_raw_contract.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import json +import os +import sys +from hashlib import sha256 +from pathlib import Path +from typing import Any + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT)) + +from yamusic_ingest.config import load_dotenv + +load_dotenv(ROOT / ".env") +RAW_DIR = ROOT / os.getenv("STREAMIFY_RAW_DIR", "data/raw/yamusic") +SOURCES = {"sample", "yandex_music"} +EVENT_TYPES = {"liked_track", "playlist_membership"} +DIAGNOSTIC_FIELDS = [ + "liked_shortcuts_seen", + "liked_tracks_written", + "liked_shortcuts_fetch_failed", + "liked_shortcuts_missing_track_id", + "liked_tracks_duplicate_skipped", + "liked_albums_seen", + "liked_albums_written", + "liked_albums_missing_id", + "liked_albums_duplicate_skipped", + "liked_artists_seen", + "liked_artists_written", + "liked_artists_missing_id", + "liked_artists_duplicate_skipped", + "liked_playlists_seen", + "liked_playlists_written", + "liked_playlists_missing_id", + "liked_playlists_duplicate_skipped", + "playlists_seen", + "playlists_written", + "playlists_missing_id", + "playlist_fetch_fallbacks", + "playlist_tracks_seen", + "playlist_tracks_written", + "playlist_tracks_fetch_failed", + "playlist_tracks_missing_track_id", + "playlist_tracks_duplicate_skipped", +] + +SCHEMAS: dict[str, dict[str, tuple[type, ...]]] = { + "tracks": { + "track_id": (str,), + "title": (str,), + "artist_ids": (list,), + "artist_names": (list,), + "liked": (bool,), + "source": (str,), + "ingested_at": (str,), + }, + "artists": { + "artist_id": (str,), + "artist_name": (str,), + "source": (str,), + "ingested_at": (str,), + }, 
def rows_for(dataset: str) -> list[dict[str, Any]]:
    """Load all JSON objects from ``RAW_DIR/<dataset>.jsonl``.

    Blank lines are skipped. Each remaining line must parse as a JSON object.

    Args:
        dataset: Dataset name without extension (for example ``"tracks"``).

    Returns:
        The parsed rows in file order.

    Raises:
        AssertionError: via ``fail`` when the file is missing, a line is not
            valid JSON, or a line is valid JSON but not an object.
    """
    path = RAW_DIR / f"{dataset}.jsonl"
    if not path.exists():
        fail(f"Missing raw dataset: {path}")
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as error:
                # Report the dataset and line like every other contract
                # error, instead of a bare decoder message with no location.
                fail(f"{dataset}.jsonl:{line_number} is not valid JSON: {error}")
            if not isinstance(value, dict):
                fail(f"{dataset}.jsonl:{line_number} must contain a JSON object")
            rows.append(value)
    return rows
def require_unique(rows: list[dict[str, Any]], dataset: str, fields: tuple[str, ...]) -> None:
    """Fail when two rows of ``dataset`` share the same values for ``fields``.

    Rows are numbered from 1. The key of a row is the tuple of
    ``row.get(field)`` for each requested field, so a missing field
    participates in the key as ``None``. The failure message names the
    duplicate row and the row where the key was first seen.
    """
    first_row_by_key: dict[tuple[Any, ...], int] = {}
    for position, record in enumerate(rows, start=1):
        identity = tuple(record.get(name) for name in fields)
        earlier_row = first_row_by_key.get(identity)
        if earlier_row is not None:
            label = ", ".join(fields)
            fail(f"{dataset}.jsonl:{position} duplicate {label} {identity!r}; first seen at row {earlier_row}")
        first_row_by_key[identity] = position
{row['playlist_id']!r} is not present in playlists.jsonl") + if row["track_id"] not in track_ids: + fail(f"playlist_tracks.jsonl:{row_number} track_id {row['track_id']!r} is not present in tracks.jsonl") + + for row_number, row in enumerate(rows_by_dataset["user_library_events"], start=1): + require_non_empty_id("user_library_events", row, row_number, "event_id") + require_non_empty_id("user_library_events", row, row_number, "track_id") + if row["track_id"] not in track_ids: + fail(f"user_library_events.jsonl:{row_number} track_id {row['track_id']!r} is not present in tracks.jsonl") + if row["event_type"] == "playlist_membership": + playlist_id = row.get("playlist_id") + if not isinstance(playlist_id, str) or not playlist_id.strip(): + fail(f"user_library_events.jsonl:{row_number} playlist_membership event must include playlist_id") + if playlist_id not in playlist_ids: + fail(f"user_library_events.jsonl:{row_number} playlist_id {playlist_id!r} is not present in playlists.jsonl") + + +def validate_diagnostic_consistency( + diagnostics: dict[str, int], + row_counts: dict[str, int], + rows_by_dataset: dict[str, list[dict[str, Any]]], +) -> None: + liked_track_rows = sum(1 for row in rows_by_dataset["tracks"] if row.get("liked") is True) + liked_event_rows = sum(1 for row in rows_by_dataset["user_library_events"] if row.get("event_type") == "liked_track") + playlist_event_rows = sum(1 for row in rows_by_dataset["user_library_events"] if row.get("event_type") == "playlist_membership") + + expected_equalities = { + "liked_tracks_written": liked_track_rows, + "playlist_tracks_written": row_counts["playlist_tracks"], + } + for field, expected in expected_equalities.items(): + if diagnostics[field] != expected: + fail(f"_manifest.json diagnostics.{field}={diagnostics[field]} must match written row count {expected}") + + if liked_event_rows > diagnostics["liked_tracks_written"]: + fail("_manifest.json liked_track event rows cannot exceed diagnostics.liked_tracks_written") + 
if playlist_event_rows > diagnostics["playlist_tracks_written"]: + fail("_manifest.json playlist_membership event rows cannot exceed diagnostics.playlist_tracks_written") + if diagnostics["liked_shortcuts_seen"] != ( + diagnostics["liked_tracks_written"] + + diagnostics["liked_shortcuts_missing_track_id"] + + diagnostics["liked_tracks_duplicate_skipped"] + ): + fail("_manifest.json liked shortcut diagnostics must add up to liked_shortcuts_seen") + if diagnostics["liked_albums_seen"] != ( + diagnostics["liked_albums_written"] + + diagnostics["liked_albums_missing_id"] + + diagnostics["liked_albums_duplicate_skipped"] + ): + fail("_manifest.json liked album diagnostics must add up to liked_albums_seen") + if diagnostics["liked_artists_seen"] != ( + diagnostics["liked_artists_written"] + + diagnostics["liked_artists_missing_id"] + + diagnostics["liked_artists_duplicate_skipped"] + ): + fail("_manifest.json liked artist diagnostics must add up to liked_artists_seen") + if diagnostics["liked_playlists_seen"] != ( + diagnostics["liked_playlists_written"] + + diagnostics["liked_playlists_missing_id"] + + diagnostics["liked_playlists_duplicate_skipped"] + ): + fail("_manifest.json liked playlist diagnostics must add up to liked_playlists_seen") + if diagnostics["playlists_seen"] != diagnostics["playlists_written"] + diagnostics["playlists_missing_id"]: + fail("_manifest.json playlist diagnostics must add up to playlists_seen") + if row_counts["playlists"] != diagnostics["playlists_written"] + diagnostics["liked_playlists_written"]: + fail("_manifest.json playlist rows must match playlists_written + liked_playlists_written") + if diagnostics["playlist_tracks_seen"] != ( + diagnostics["playlist_tracks_written"] + + diagnostics["playlist_tracks_missing_track_id"] + + diagnostics["playlist_tracks_duplicate_skipped"] + ): + fail("_manifest.json playlist-track diagnostics must add up to playlist_tracks_seen") + if row_counts["tracks"] > diagnostics["liked_tracks_written"] + 
def validate_manifest(row_counts: dict[str, int], rows_by_dataset: dict[str, list[dict[str, Any]]]) -> None:
    """Validate ``RAW_DIR/_manifest.json`` against the raw datasets on disk.

    Checks, in order: the manifest exists and is a JSON object, contains no
    "token" text, has a known ``source``, a non-empty ``generated_at``,
    non-empty adapter metadata, non-negative integer diagnostics that are
    mutually consistent (``validate_diagnostic_consistency``), and per-dataset
    ``row_count`` / ``jsonl_sha256`` values matching the actual files.

    Args:
        row_counts: Actual number of rows loaded for each dataset.
        rows_by_dataset: The loaded rows, keyed by dataset name.

    Raises:
        AssertionError: via ``fail`` on the first violated rule.
    """
    manifest_path = RAW_DIR / "_manifest.json"
    if not manifest_path.exists():
        fail(f"Missing raw manifest: {manifest_path}")
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    if not isinstance(manifest, dict):
        # Without this guard a top-level list/scalar crashes on ``.get``.
        fail("_manifest.json must contain a JSON object")
    if "token" in json.dumps(manifest).lower():
        fail("_manifest.json must not contain token material")
    source = manifest.get("source")
    # The isinstance guard keeps unhashable values away from the set lookup.
    if not isinstance(source, str) or source not in SOURCES:
        fail(f"_manifest.json source must be one of {sorted(SOURCES)}")
    if not isinstance(manifest.get("generated_at"), str) or not manifest["generated_at"].strip():
        fail("_manifest.json must contain generated_at")

    adapter = manifest.get("adapter")
    if not isinstance(adapter, dict):
        fail("_manifest.json must contain adapter object")
    for field in ["adapter_name", "adapter_version", "client_library"]:
        if not isinstance(adapter.get(field), str) or not adapter[field].strip():
            fail(f"_manifest.json adapter.{field} must be a non-empty string")

    diagnostics = manifest.get("diagnostics")
    if not isinstance(diagnostics, dict):
        fail("_manifest.json must contain diagnostics object")
    for field in DIAGNOSTIC_FIELDS:
        value = diagnostics.get(field)
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly or they would be accepted as the counts 1/0.
        if isinstance(value, bool) or not isinstance(value, int) or value < 0:
            fail(f"_manifest.json diagnostics.{field} must be a non-negative integer")
    validate_diagnostic_consistency(diagnostics, row_counts, rows_by_dataset)

    datasets = manifest.get("datasets")
    if not isinstance(datasets, dict):
        fail("_manifest.json must contain datasets object")
    for dataset, actual_count in row_counts.items():
        dataset_manifest = datasets.get(dataset, {})
        if not isinstance(dataset_manifest, dict):
            fail(f"_manifest.json datasets.{dataset} must be an object")
        expected_count = dataset_manifest.get("row_count")
        if isinstance(expected_count, bool) or expected_count != actual_count:
            fail(f"_manifest.json row_count mismatch for {dataset}: manifest={expected_count}, actual={actual_count}")
        expected_sha256 = dataset_manifest.get("jsonl_sha256")
        if not isinstance(expected_sha256, str) or len(expected_sha256) != 64:
            fail(f"_manifest.json datasets.{dataset}.jsonl_sha256 must be a 64-character sha256 hex digest")
        actual_sha256 = file_sha256(RAW_DIR / f"{dataset}.jsonl")
        if expected_sha256 != actual_sha256:
            fail(f"_manifest.json sha256 mismatch for {dataset}: manifest={expected_sha256}, actual={actual_sha256}")
"diagnostic_liked_shortcuts_fetch_failed": 4, + "diagnostic_playlist_tracks_fetch_failed": 5, + "diagnostic_playlist_tracks_missing_track_id": 2, + "diagnostic_liked_tracks_duplicate_skipped": 1, + "diagnostic_playlist_tracks_duplicate_skipped": 3, + "top_artist_concentration": 0.65, + } + ) + + assert any("stale_ingestion_flag" in action for action in actions) + assert any("liked shortcuts failed" in action for action in actions) + assert any("playlist shortcuts failed" in action for action in actions) + assert any("no stable track id" in action for action in actions) + assert any("Duplicate library rows" in action for action in actions) + assert any("Genre coverage is missing" in action for action in actions) + assert any("concentrated" in action for action in actions) + + +def test_build_data_next_actions_has_ready_state() -> None: + actions = build_data_next_actions( + { + "manifest_source": "yandex_music", + "total_tracks": 100, + "known_genres": 12, + "stale_ingestion_flag": 0, + "diagnostic_liked_shortcuts_fetch_failed": 0, + "diagnostic_playlist_tracks_missing_track_id": 0, + "top_artist_concentration": 0.2, + } + ) + + assert actions == ["Data is ready for exploration; review rediscovery tracks, playlist overlap and genre shifts."] diff --git a/tests/test_dashboard_filters.py b/tests/test_dashboard_filters.py new file mode 100644 index 0000000..ee8fe6c --- /dev/null +++ b/tests/test_dashboard_filters.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import pandas as pd + +from dashboard.filters import apply_track_filters + + +def track_frame() -> pd.DataFrame: + return pd.DataFrame( + [ + {"title": "Signal One", "artist_display": "Nadia Vector", "album_title": "Local Lake", "genre": "electronic", "liked": True}, + {"title": "Blue Warehouse", "artist_display": "Duck DB Trio", "album_title": "Warehouse Sketches", "genre": "jazz", "liked": True}, + {"title": "Quiet Branch", "artist_display": "The Lineage", "album_title": "Local Lake", "genre": 
"electronic", "liked": False}, + ] + ) + + +def test_apply_track_filters_by_genre_and_liked() -> None: + result = apply_track_filters(track_frame(), ["electronic"], "Liked", "") + + assert result["title"].tolist() == ["Signal One"] + + +def test_apply_track_filters_searches_title_artist_and_album_case_insensitively() -> None: + by_artist = apply_track_filters(track_frame(), ["electronic", "jazz"], "All", "nadia") + by_album = apply_track_filters(track_frame(), ["electronic", "jazz"], "All", "warehouse") + + assert by_artist["title"].tolist() == ["Signal One"] + assert by_album["title"].tolist() == ["Blue Warehouse"] + + +def test_apply_track_filters_not_liked() -> None: + result = apply_track_filters(track_frame(), ["electronic", "jazz"], "Not liked", "") + + assert result["title"].tolist() == ["Quiet Branch"] diff --git a/tests/test_yamusic_ingest.py b/tests/test_yamusic_ingest.py new file mode 100644 index 0000000..8fa5d37 --- /dev/null +++ b/tests/test_yamusic_ingest.py @@ -0,0 +1,834 @@ +from __future__ import annotations + +import json +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path + +from yamusic_ingest.config import load_dotenv +from yamusic_ingest.__main__ import sample_diagnostics, status_payload +from yamusic_ingest.io import file_sha256, remove_file_if_exists, write_json, write_jsonl, write_parquet_if_available +from yamusic_ingest.sample import sample_payload +import yamusic_ingest.yandex_client as yandex_client +from yamusic_ingest.yandex_client import _sanitize_message, build_ingest_result_from_client, build_payload_from_client, client_metadata, preflight_client + + +ROOT = Path(__file__).resolve().parents[1] + + +def test_sample_payload_has_required_datasets() -> None: + payload = sample_payload() + assert set(payload) == { + "tracks", + "artists", + "albums", + "playlists", + "playlist_tracks", + "user_library_events", + } + assert payload["tracks"] + assert payload["playlist_tracks"] + + +def 
test_write_jsonl_round_trips_unicode(tmp_path) -> None: + rows = [{"track_id": "1", "title": "Тест"}] + path = tmp_path / "tracks.jsonl" + assert write_jsonl(path, rows) == 1 + loaded = [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines()] + assert loaded == rows + + +def test_file_sha256_is_stable_for_written_jsonl(tmp_path) -> None: + path = tmp_path / "tracks.jsonl" + write_jsonl(path, [{"track_id": "1", "title": "Тест"}]) + + assert file_sha256(path) == file_sha256(path) + assert len(file_sha256(path)) == 64 + + +def test_write_json_manifest_keeps_counts_without_token_material(tmp_path) -> None: + manifest = { + "source": "sample", + "datasets": {"tracks": {"row_count": 1}}, + } + path = tmp_path / "_manifest.json" + write_json(path, manifest) + loaded = json.loads(path.read_text(encoding="utf-8")) + assert loaded == manifest + assert "token" not in path.read_text(encoding="utf-8").lower() + + +def write_contract_fixture(raw_dir: Path, payload: dict[str, list[dict[str, object]]]) -> None: + raw_dir.mkdir(parents=True, exist_ok=True) + datasets = {} + for name, rows in payload.items(): + path = raw_dir / f"{name}.jsonl" + write_jsonl(path, rows) + datasets[name] = {"row_count": len(rows), "jsonl_sha256": file_sha256(path)} + write_json( + raw_dir / "_manifest.json", + { + "generated_at": "2026-01-01T00:00:00+00:00", + "source": "sample", + "adapter": client_metadata(), + "diagnostics": sample_diagnostics(payload), + "datasets": datasets, + }, + ) + + +def run_raw_contract(raw_dir: Path) -> subprocess.CompletedProcess[str]: + env = os.environ.copy() + env["STREAMIFY_RAW_DIR"] = str(raw_dir) + return subprocess.run( + [sys.executable, "scripts/validate_yamusic_raw_contract.py"], + cwd=ROOT, + env=env, + text=True, + capture_output=True, + check=False, + ) + + +def test_raw_contract_accepts_sample_payload(tmp_path) -> None: + raw_dir = tmp_path / "raw" + write_contract_fixture(raw_dir, sample_payload()) + + result = run_raw_contract(raw_dir) 
+ + assert result.returncode == 0, result.stderr + assert "raw schema contract is valid" in result.stdout + + +def test_raw_contract_rejects_orphan_playlist_track(tmp_path) -> None: + raw_dir = tmp_path / "raw" + payload = sample_payload() + payload["playlist_tracks"][0]["track_id"] = "missing-track" + write_contract_fixture(raw_dir, payload) + + result = run_raw_contract(raw_dir) + + assert result.returncode == 1 + assert "is not present in tracks.jsonl" in result.stderr + + +def test_raw_contract_rejects_manifest_diagnostics_mismatch(tmp_path) -> None: + raw_dir = tmp_path / "raw" + write_contract_fixture(raw_dir, sample_payload()) + manifest_path = raw_dir / "_manifest.json" + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + manifest["diagnostics"]["playlist_tracks_written"] += 1 + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + + result = run_raw_contract(raw_dir) + + assert result.returncode == 1 + assert "diagnostics.playlist_tracks_written" in result.stderr + + +def test_raw_contract_allows_fetch_failure_counters_for_written_fallback_rows(tmp_path) -> None: + raw_dir = tmp_path / "raw" + write_contract_fixture(raw_dir, sample_payload()) + manifest_path = raw_dir / "_manifest.json" + manifest = json.loads(manifest_path.read_text(encoding="utf-8")) + manifest["diagnostics"]["liked_shortcuts_fetch_failed"] = 1 + manifest["diagnostics"]["playlist_tracks_fetch_failed"] = 1 + manifest_path.write_text(json.dumps(manifest), encoding="utf-8") + + result = run_raw_contract(raw_dir) + + assert result.returncode == 0, result.stderr + + +def test_raw_contract_rejects_jsonl_checksum_mismatch(tmp_path) -> None: + raw_dir = tmp_path / "raw" + write_contract_fixture(raw_dir, sample_payload()) + with (raw_dir / "tracks.jsonl").open("a", encoding="utf-8") as file: + file.write("\n") + + result = run_raw_contract(raw_dir) + + assert result.returncode == 1 + assert "sha256 mismatch for tracks" in result.stderr + + +def 
test_empty_parquet_write_removes_stale_file(tmp_path) -> None: + path = tmp_path / "tracks.parquet" + path.write_bytes(b"stale") + + assert write_parquet_if_available(path, []) is False + + assert not path.exists() + + +def test_remove_file_if_exists_is_idempotent(tmp_path) -> None: + path = tmp_path / "tracks.parquet" + + remove_file_if_exists(path) + path.write_bytes(b"stale") + remove_file_if_exists(path) + + assert not path.exists() + + +def test_preflight_client_returns_safe_counts() -> None: + client = FakeClient(liked_tracks=[object(), object()], playlists=[]) + result = preflight_client(client) + assert result["source"] == "yandex_music" + assert result["status"] == "ok" + assert result["liked_shortcut_count"] == 2 + assert result["liked_album_count"] == 0 + assert result["liked_artist_count"] == 0 + assert result["liked_playlist_count"] == 0 + assert result["playlist_count"] == 0 + assert result["adapter_name"] == "yamusic_ingest" + assert result["adapter_version"] + assert result["client_library"] == "yandex-music" + assert "token" not in json.dumps(result).lower() + + +def test_sanitize_message_redacts_token() -> None: + assert _sanitize_message("bad token secret-123", "secret-123") == "bad token [redacted-token]" + + +def test_load_dotenv_reads_export_and_preserves_existing_env(tmp_path, monkeypatch) -> None: + env_path = tmp_path / ".env" + env_path.write_text( + "\n".join( + [ + "# local settings", + "export YANDEX_MUSIC_TOKEN=from-file", + "STREAMIFY_RAW_DIR='data/custom/raw'", + "STREAMIFY_DUCKDB_PATH=\"data/custom.duckdb\"", + ] + ), + encoding="utf-8", + ) + monkeypatch.setenv("YANDEX_MUSIC_TOKEN", "from-shell") + monkeypatch.delenv("STREAMIFY_RAW_DIR", raising=False) + monkeypatch.delenv("STREAMIFY_DUCKDB_PATH", raising=False) + + load_dotenv(env_path) + + assert os.environ["YANDEX_MUSIC_TOKEN"] == "from-shell" + assert os.environ["STREAMIFY_RAW_DIR"] == "data/custom/raw" + assert os.environ["STREAMIFY_DUCKDB_PATH"] == "data/custom.duckdb" + + 
+def test_load_dotenv_preserves_token_special_characters(tmp_path, monkeypatch) -> None: + env_path = tmp_path / ".env" + env_path.write_text("YANDEX_MUSIC_TOKEN='ya#token$with=chars'\n", encoding="utf-8") + monkeypatch.delenv("YANDEX_MUSIC_TOKEN", raising=False) + + load_dotenv(env_path) + + assert os.environ["YANDEX_MUSIC_TOKEN"] == "ya#token$with=chars" + + +def test_status_payload_does_not_expose_token_material(tmp_path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + settings = type("SettingsLike", (), {"token": "secret-token", "raw_dir": Path("raw")})() + + payload = status_payload(settings) + serialized = json.dumps(payload) + + assert payload["token_configured"] is True + assert "secret-token" not in serialized + assert payload["next_step"] == "make preflight" + assert payload["snapshot_path"] == "data/streamify_snapshot.json" + assert payload["recommendations_dir"] == "data/recommendations" + + +def test_status_payload_reads_last_manifest_without_api_call(tmp_path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + (raw_dir / "_manifest.json").write_text( + json.dumps({"source": "yandex_music", "generated_at": "2026-06-15T12:00:00+00:00"}), + encoding="utf-8", + ) + settings = type("SettingsLike", (), {"token": "secret-token", "raw_dir": raw_dir})() + + payload = status_payload(settings) + serialized = json.dumps(payload) + + assert payload["last_source"] == "yandex_music" + assert payload["last_generated_at"] == "2026-06-15T12:00:00+00:00" + assert payload["manifest_read_error"] is None + assert payload["next_step"] == "make acceptance-real" + assert "secret-token" not in serialized + + +def test_status_payload_reports_broken_manifest_without_failing(tmp_path, monkeypatch) -> None: + monkeypatch.chdir(tmp_path) + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + (raw_dir / "_manifest.json").write_text("{not-json", encoding="utf-8") + settings = type("SettingsLike", (), {"token": None, "raw_dir": 
raw_dir})() + + payload = status_payload(settings) + + assert payload["raw_manifest_exists"] is True + assert payload["last_source"] is None + assert payload["manifest_read_error"] == "JSONDecodeError" + + +@dataclass +class FakeLikesResponse: + tracks: list[object] + + +@dataclass +class FakeShortcut: + track: object + timestamp: str + + def fetch_track(self) -> object: + return self.track + + +@dataclass +class FakePlaylistItem: + track: object + created_at: str + + +@dataclass +class FakePlaylist: + kind: int + title: str + track_count: int + tracks: list[FakePlaylistItem] + uid: int | None = None + owner: object | None = None + + def fetch_tracks(self) -> "FakePlaylist": + return self + + +@dataclass +class FakeOwner: + uid: int + + +class FakeClient: + def __init__( + self, + liked_tracks: list[object], + playlists: list[FakePlaylist], + liked_albums: list[object] | None = None, + liked_artists: list[object] | None = None, + liked_playlists: list[object] | None = None, + ) -> None: + self._liked_tracks = liked_tracks + self._playlists = playlists + self._liked_albums = liked_albums or [] + self._liked_artists = liked_artists or [] + self._liked_playlists = liked_playlists or [] + + def users_likes_tracks(self) -> FakeLikesResponse: + return FakeLikesResponse(self._liked_tracks) + + def users_playlists_list(self) -> list[FakePlaylist]: + return self._playlists + + def users_likes_albums(self) -> list[object]: + return self._liked_albums + + def users_likes_artists(self) -> list[object]: + return self._liked_artists + + def users_likes_playlists(self) -> list[object]: + return self._liked_playlists + + +class FlakyClient(FakeClient): + def __init__(self, liked_tracks: list[object], playlists: list[FakePlaylist]) -> None: + super().__init__(liked_tracks, playlists) + self.likes_calls = 0 + self.playlist_calls = 0 + + def users_likes_tracks(self) -> FakeLikesResponse: + self.likes_calls += 1 + if self.likes_calls == 1: + raise RuntimeError("temporary likes 
failure") + return super().users_likes_tracks() + + def users_playlists_list(self) -> list[FakePlaylist]: + self.playlist_calls += 1 + if self.playlist_calls == 1: + raise RuntimeError("temporary playlists failure") + return super().users_playlists_list() + + +class FlakyShortcut(FakeShortcut): + def __init__(self, track: object, timestamp: str) -> None: + super().__init__(track, timestamp) + self.calls = 0 + + def fetch_track(self) -> object: + self.calls += 1 + if self.calls == 1: + raise RuntimeError("temporary track failure") + return self.track + + +class AlwaysFailingShortcut(FakeShortcut): + def fetch_track(self) -> object: + raise RuntimeError("permanent track failure") + + +@dataclass +class FakeLikedShortcutOnlyId: + id: str + timestamp: str + + def fetch_track(self) -> object: + raise RuntimeError("liked shortcut hydration failed") + + +@dataclass +class FakePlaylistReturningTrackList(FakePlaylist): + fetched_tracks: list[object] | None = None + + def fetch_tracks(self) -> list[object]: + return self.fetched_tracks or [] + + +@dataclass +class FakePlaylistTrackShortcut: + id: str + timestamp: str + track: object | None = None + + def fetch_track(self) -> object: + if self.track is None: + raise RuntimeError("playlist shortcut hydration failed") + return self.track + + +def test_build_payload_from_client_normalizes_liked_tracks_playlists_and_timestamps() -> None: + album = {"id": 101, "title": "Adapter Album", "genre": "indie", "year": 2025} + artist = {"id": 201, "name": "Adapter Artist"} + liked_track = { + "id": "t-1", + "title": "Liked Track", + "duration_ms": 123000, + "albums": [album], + "artists": [artist], + "major": {"name": "Adapter Label"}, + } + playlist_track = { + "id": "t-2", + "title": "Playlist Track", + "durationMs": 234000, + "albums": [album], + "artists": [artist], + } + client = FakeClient( + liked_tracks=[FakeShortcut(liked_track, "2026-01-01T10:00:00+00:00")], + playlists=[ + FakePlaylist( + kind=77, + title="Adapter Playlist", + 
track_count=1, + tracks=[FakePlaylistItem(playlist_track, "2026-02-01T11:00:00+00:00")], + ) + ], + ) + + payload = build_payload_from_client(client) + + assert set(payload) == { + "tracks", + "artists", + "albums", + "playlists", + "playlist_tracks", + "user_library_events", + } + assert {track["track_id"] for track in payload["tracks"]} == {"t-1", "t-2"} + liked = next(track for track in payload["tracks"] if track["track_id"] == "t-1") + assert liked["liked"] is True + assert liked["genre"] == "indie" + assert liked["label"] == "Adapter Label" + assert payload["albums"] == [ + { + "album_id": "101", + "album_title": "Adapter Album", + "genre": "indie", + "release_year": 2025, + "source": "yandex_music", + "ingested_at": payload["albums"][0]["ingested_at"], + } + ] + assert payload["playlist_tracks"][0]["added_at"] == "2026-02-01T11:00:00+00:00" + events_by_id = {event["event_id"]: event for event in payload["user_library_events"]} + assert events_by_id["liked_track:t-1"]["event_ts"] == "2026-01-01T10:00:00+00:00" + assert events_by_id["playlist_membership:77:t-2"]["event_ts"] == "2026-02-01T11:00:00+00:00" + + +def test_build_payload_from_client_includes_liked_album_and_artist_metadata() -> None: + client = FakeClient( + liked_tracks=[], + playlists=[], + liked_albums=[ + { + "album": { + "id": "liked-album-1", + "title": "Liked Album", + "genre": "ambient", + "original_release_year": 2022, + } + }, + {"album": {"title": "Missing Id"}}, + ], + liked_artists=[ + {"artist": {"id": "liked-artist-1", "name": "Liked Artist"}}, + {"artist": {"name": "Missing Id"}}, + ], + ) + + result = build_ingest_result_from_client(client) + + assert result.payload["albums"] == [ + { + "album_id": "liked-album-1", + "album_title": "Liked Album", + "genre": "ambient", + "release_year": 2022, + "source": "yandex_music", + "ingested_at": result.payload["albums"][0]["ingested_at"], + } + ] + assert result.payload["artists"] == [ + { + "artist_id": "liked-artist-1", + "artist_name": 
"Liked Artist", + "source": "yandex_music", + "ingested_at": result.payload["artists"][0]["ingested_at"], + } + ] + assert result.diagnostics["liked_albums_seen"] == 2 + assert result.diagnostics["liked_albums_written"] == 1 + assert result.diagnostics["liked_albums_missing_id"] == 1 + assert result.diagnostics["liked_artists_seen"] == 2 + assert result.diagnostics["liked_artists_written"] == 1 + assert result.diagnostics["liked_artists_missing_id"] == 1 + + +def test_build_payload_from_client_includes_liked_playlist_metadata_and_dedupes_owned_playlists() -> None: + owned_playlist = FakePlaylist( + uid=100, + kind=10, + title="Owned Playlist", + track_count=0, + tracks=[], + ) + liked_playlist = { + "playlist": FakePlaylist( + uid=200, + kind=20, + title="Liked Playlist", + track_count=12, + tracks=[], + ) + } + duplicate_liked_playlist = {"playlist": owned_playlist} + missing_id_liked_playlist = {"playlist": {"title": "Missing Id"}} + client = FakeClient( + liked_tracks=[], + playlists=[owned_playlist], + liked_playlists=[liked_playlist, duplicate_liked_playlist, missing_id_liked_playlist], + ) + + result = build_ingest_result_from_client(client) + + assert [row["playlist_id"] for row in result.payload["playlists"]] == ["100:10", "200:20"] + assert result.payload["playlists"][1]["playlist_title"] == "Liked Playlist" + assert result.payload["playlists"][1]["track_count"] == 12 + assert result.diagnostics["playlists_seen"] == 1 + assert result.diagnostics["playlists_written"] == 1 + assert result.diagnostics["liked_playlists_seen"] == 3 + assert result.diagnostics["liked_playlists_written"] == 1 + assert result.diagnostics["liked_playlists_missing_id"] == 1 + assert result.diagnostics["liked_playlists_duplicate_skipped"] == 1 + + +def test_build_payload_uses_playlist_owner_uid_when_playlist_uid_is_missing() -> None: + track = { + "id": "owner-playlist-track", + "title": "Owner Playlist Track", + "duration_ms": 123000, + "albums": [], + "artists": [], + } + playlist 
= FakePlaylist( + uid=None, + owner=FakeOwner(uid=4242), + kind=55, + title="Owner Scoped Playlist", + track_count=1, + tracks=[FakePlaylistItem(track, "2026-06-04T10:00:00+00:00")], + ) + client = FakeClient(liked_tracks=[], playlists=[playlist]) + + payload = build_payload_from_client(client) + + assert payload["playlists"][0]["playlist_id"] == "4242:55" + assert payload["playlist_tracks"][0]["playlist_id"] == "4242:55" + assert payload["user_library_events"][0]["event_id"] == "playlist_membership:4242:55:owner-playlist-track" + + +def test_preflight_client_retries_transient_top_level_failures(monkeypatch) -> None: + monkeypatch.setattr(yandex_client, "sleep", lambda _seconds: None) + client = FlakyClient(liked_tracks=[object()], playlists=[]) + + result = preflight_client(client) + + assert result["liked_shortcut_count"] == 1 + assert result["playlist_count"] == 0 + assert client.likes_calls == 2 + assert client.playlist_calls == 2 + + +def test_build_payload_retries_transient_track_fetch(monkeypatch) -> None: + monkeypatch.setattr(yandex_client, "sleep", lambda _seconds: None) + track = { + "id": "flaky-track", + "title": "Retry Track", + "duration_ms": 123000, + "albums": [], + "artists": [{"id": "retry-artist", "name": "Retry Artist"}], + } + shortcut = FlakyShortcut(track, "2026-04-01T10:00:00+00:00") + client = FakeClient(liked_tracks=[shortcut], playlists=[]) + + payload = build_payload_from_client(client) + + assert shortcut.calls == 2 + assert [row["track_id"] for row in payload["tracks"]] == ["flaky-track"] + assert payload["user_library_events"][0]["event_id"] == "liked_track:flaky-track" + + +def test_build_ingest_result_reports_safe_skip_diagnostics(monkeypatch) -> None: + monkeypatch.setattr(yandex_client, "sleep", lambda _seconds: None) + good_track = { + "id": "good-track", + "title": "Good Track", + "duration_ms": 123000, + "albums": [], + "artists": [{"id": "good-artist", "name": "Good Artist"}], + } + missing_id_track = {"title": "No Id"} + 
failed_shortcut = AlwaysFailingShortcut(good_track, "2026-05-01T10:00:00+00:00") + playlist = FakePlaylist( + kind=88, + title="Diagnostics Playlist", + track_count=2, + tracks=[ + FakePlaylistItem(good_track, "2026-05-02T10:00:00+00:00"), + FakePlaylistItem(missing_id_track, "2026-05-03T10:00:00+00:00"), + ], + ) + client = FakeClient( + liked_tracks=[ + FakeShortcut(good_track, "2026-05-01T10:00:00+00:00"), + FakeShortcut(missing_id_track, "2026-05-01T11:00:00+00:00"), + failed_shortcut, + ], + playlists=[{"title": "No Stable Id"}, playlist], + ) + + result = build_ingest_result_from_client(client) + + assert result.diagnostics["liked_shortcuts_seen"] == 3 + assert result.diagnostics["liked_tracks_written"] == 1 + assert result.diagnostics["liked_shortcuts_fetch_failed"] == 1 + assert result.diagnostics["liked_shortcuts_missing_track_id"] == 1 + assert result.diagnostics["liked_tracks_duplicate_skipped"] == 1 + assert result.diagnostics["playlists_seen"] == 2 + assert result.diagnostics["playlists_written"] == 1 + assert result.diagnostics["playlists_missing_id"] == 1 + assert result.diagnostics["playlist_tracks_seen"] == 2 + assert result.diagnostics["playlist_tracks_written"] == 1 + assert result.diagnostics["playlist_tracks_missing_track_id"] == 1 + + +def test_build_ingest_result_keeps_liked_shortcut_id_when_hydration_fails(monkeypatch) -> None: + monkeypatch.setattr(yandex_client, "sleep", lambda _seconds: None) + shortcut = FakeLikedShortcutOnlyId(id="shortcut-only-track", timestamp="2026-06-03T09:00:00+00:00") + client = FakeClient(liked_tracks=[shortcut], playlists=[]) + + result = build_ingest_result_from_client(client) + + assert result.payload["tracks"] == [ + { + "track_id": "shortcut-only-track", + "title": "", + "duration_ms": None, + "album_id": None, + "album_title": None, + "genre": None, + "release_year": None, + "label": None, + "artist_ids": [], + "artist_names": [], + "liked": True, + "source": "yandex_music", + "ingested_at": 
result.payload["tracks"][0]["ingested_at"], + "liked_at": "2026-06-03T09:00:00+00:00", + } + ] + assert result.payload["user_library_events"][0]["event_id"] == "liked_track:shortcut-only-track" + assert result.payload["user_library_events"][0]["event_ts"] == "2026-06-03T09:00:00+00:00" + assert result.diagnostics["liked_shortcuts_seen"] == 1 + assert result.diagnostics["liked_tracks_written"] == 1 + assert result.diagnostics["liked_shortcuts_fetch_failed"] == 1 + assert result.diagnostics["liked_shortcuts_missing_track_id"] == 0 + + +def test_build_ingest_result_deduplicates_repeated_library_rows() -> None: + track = { + "id": "dup-track", + "title": "Duplicate Track", + "duration_ms": 123000, + "albums": [], + "artists": [{"id": "dup-artist", "name": "Duplicate Artist"}], + } + playlist = FakePlaylist( + kind=99, + title="Duplicate Playlist", + track_count=2, + tracks=[ + FakePlaylistItem(track, "2026-06-01T10:00:00+00:00"), + FakePlaylistItem(track, "2026-06-01T10:01:00+00:00"), + ], + ) + client = FakeClient( + liked_tracks=[ + FakeShortcut(track, "2026-06-01T09:00:00+00:00"), + FakeShortcut(track, "2026-06-01T09:01:00+00:00"), + ], + playlists=[playlist], + ) + + result = build_ingest_result_from_client(client) + + assert [row["track_id"] for row in result.payload["tracks"]] == ["dup-track"] + assert result.payload["playlist_tracks"] == [ + { + "playlist_id": "99", + "track_id": "dup-track", + "position": 1, + "added_at": "2026-06-01T10:00:00+00:00", + "source": "yandex_music", + "ingested_at": result.payload["playlist_tracks"][0]["ingested_at"], + } + ] + assert {event["event_id"] for event in result.payload["user_library_events"]} == { + "liked_track:dup-track", + "playlist_membership:99:dup-track", + } + assert result.diagnostics["liked_shortcuts_seen"] == 2 + assert result.diagnostics["liked_tracks_written"] == 1 + assert result.diagnostics["liked_tracks_duplicate_skipped"] == 1 + assert result.diagnostics["playlist_tracks_seen"] == 2 + assert 
result.diagnostics["playlist_tracks_written"] == 1 + assert result.diagnostics["playlist_tracks_duplicate_skipped"] == 1 + + +def test_build_ingest_result_accepts_playlist_fetch_tracks_list_and_hydrates_shortcuts(monkeypatch) -> None: + monkeypatch.setattr(yandex_client, "sleep", lambda _seconds: None) + full_track = { + "id": "list-track-full", + "title": "List Track Full", + "duration_ms": 210000, + "albums": [{"id": "list-album", "title": "List Album", "genre": "jazz"}], + "artists": [{"id": "list-artist", "name": "List Artist"}], + } + shortcut_only = FakePlaylistTrackShortcut(id="list-track-shortcut", timestamp="2026-06-02T10:00:00+00:00") + playlist = FakePlaylistReturningTrackList( + kind=101, + title="Fetched List Playlist", + track_count=2, + tracks=[], + fetched_tracks=[ + FakePlaylistTrackShortcut(id="list-track-full", timestamp="2026-06-02T09:00:00+00:00", track=full_track), + shortcut_only, + ], + ) + client = FakeClient(liked_tracks=[], playlists=[playlist]) + + result = build_ingest_result_from_client(client) + + tracks_by_id = {row["track_id"]: row for row in result.payload["tracks"]} + assert set(tracks_by_id) == {"list-track-full", "list-track-shortcut"} + assert tracks_by_id["list-track-full"]["artist_names"] == ["List Artist"] + assert tracks_by_id["list-track-shortcut"]["title"] == "" + assert [row["track_id"] for row in result.payload["playlist_tracks"]] == ["list-track-full", "list-track-shortcut"] + assert result.diagnostics["playlist_tracks_seen"] == 2 + assert result.diagnostics["playlist_tracks_written"] == 2 + assert result.diagnostics["playlist_tracks_fetch_failed"] == 1 + assert result.diagnostics["playlist_tracks_missing_track_id"] == 0 + + +def test_build_payload_from_client_handles_empty_account() -> None: + payload = build_payload_from_client(FakeClient(liked_tracks=[], playlists=[])) + assert payload == { + "tracks": [], + "artists": [], + "albums": [], + "playlists": [], + "playlist_tracks": [], + "user_library_events": [], + } + 
+ +def test_build_payload_from_yandex_music_model_instances_and_skips_missing_track_ids() -> None: + from yandex_music import Album, Artist, Playlist, Track + + artist = Artist(id=301, name="Model Artist", genres=["post-rock"]) + album = Album( + id=401, + title="Model Album", + labels=["Model Label"], + original_release_year=2024, + ) + liked_track = Track(id="model-1", title="Model Track", artists=[artist], albums=[album], duration_ms=210000) + playlist_track = Track(id="model-2", title="Duration Seconds", artists=[artist], albums=[album]) + playlist_track.duration = 187 + no_id_track = {"title": "No Id"} + + client = FakeClient( + liked_tracks=[FakeShortcut(liked_track, "2026-03-01T10:00:00+00:00"), FakeShortcut(no_id_track, "2026-03-02T10:00:00+00:00")], + playlists=[ + Playlist( + uid=500, + kind=600, + title="Model Playlist", + track_count=2, + tracks=[FakePlaylistItem(playlist_track, "2026-03-03T10:00:00+00:00"), FakePlaylistItem(no_id_track, "2026-03-04T10:00:00+00:00")], + owner=None, + cover=None, + made_for=None, + play_counter=None, + playlist_absence=None, + ) + ], + ) + + payload = build_payload_from_client(client) + + assert {track["track_id"] for track in payload["tracks"]} == {"model-1", "model-2"} + liked = next(track for track in payload["tracks"] if track["track_id"] == "model-1") + duration_fallback = next(track for track in payload["tracks"] if track["track_id"] == "model-2") + assert liked["genre"] == "post-rock" + assert liked["release_year"] == 2024 + assert liked["label"] == "Model Label" + assert duration_fallback["duration_ms"] == 187000 + assert payload["playlists"][0]["playlist_id"] == "500:600" + assert len(payload["playlist_tracks"]) == 1 + assert len(payload["user_library_events"]) == 2 diff --git a/yamusic_ingest/__init__.py b/yamusic_ingest/__init__.py new file mode 100644 index 0000000..7e6ee18 --- /dev/null +++ b/yamusic_ingest/__init__.py @@ -0,0 +1,5 @@ +"""Yandex Music metadata ingestion for local Streamify analytics.""" + 
+__all__ = ["__version__"] + +__version__ = "0.1.0" diff --git a/yamusic_ingest/__main__.py b/yamusic_ingest/__main__.py new file mode 100644 index 0000000..952e840 --- /dev/null +++ b/yamusic_ingest/__main__.py @@ -0,0 +1,204 @@ +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path +import sys +from datetime import UTC, datetime + +from yamusic_ingest.config import Settings +from yamusic_ingest.io import file_sha256, remove_file_if_exists, write_json, write_jsonl, write_parquet_if_available +from yamusic_ingest.sample import sample_payload +from yamusic_ingest.yandex_client import YandexMusicIngestError, client_metadata, fetch_ingest_result, preflight_token + + +DIAGNOSTIC_FIELDS = [ + "liked_shortcuts_seen", + "liked_tracks_written", + "liked_shortcuts_fetch_failed", + "liked_shortcuts_missing_track_id", + "liked_tracks_duplicate_skipped", + "liked_albums_seen", + "liked_albums_written", + "liked_albums_missing_id", + "liked_albums_duplicate_skipped", + "liked_artists_seen", + "liked_artists_written", + "liked_artists_missing_id", + "liked_artists_duplicate_skipped", + "liked_playlists_seen", + "liked_playlists_written", + "liked_playlists_missing_id", + "liked_playlists_duplicate_skipped", + "playlists_seen", + "playlists_written", + "playlists_missing_id", + "playlist_fetch_fallbacks", + "playlist_tracks_seen", + "playlist_tracks_written", + "playlist_tracks_fetch_failed", + "playlist_tracks_missing_track_id", + "playlist_tracks_duplicate_skipped", +] + + +def sample_diagnostics(payload: dict[str, list[dict[str, object]]]) -> dict[str, int]: + return { + "liked_shortcuts_seen": sum(1 for row in payload["tracks"] if row.get("liked")), + "liked_tracks_written": sum(1 for row in payload["tracks"] if row.get("liked")), + "liked_shortcuts_fetch_failed": 0, + "liked_shortcuts_missing_track_id": 0, + "liked_tracks_duplicate_skipped": 0, + "liked_albums_seen": 0, + "liked_albums_written": 0, + 
"liked_albums_missing_id": 0, + "liked_albums_duplicate_skipped": 0, + "liked_artists_seen": 0, + "liked_artists_written": 0, + "liked_artists_missing_id": 0, + "liked_artists_duplicate_skipped": 0, + "liked_playlists_seen": 0, + "liked_playlists_written": 0, + "liked_playlists_missing_id": 0, + "liked_playlists_duplicate_skipped": 0, + "playlists_seen": len(payload["playlists"]), + "playlists_written": len(payload["playlists"]), + "playlists_missing_id": 0, + "playlist_fetch_fallbacks": 0, + "playlist_tracks_seen": len(payload["playlist_tracks"]), + "playlist_tracks_written": len(payload["playlist_tracks"]), + "playlist_tracks_fetch_failed": 0, + "playlist_tracks_missing_track_id": 0, + "playlist_tracks_duplicate_skipped": 0, + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Ingest Yandex Music metadata for local Streamify analytics.") + parser.add_argument("--sample", action="store_true", help="Write deterministic sample data instead of calling Yandex Music.") + parser.add_argument("--json-only", action="store_true", help="Skip optional Parquet output.") + parser.add_argument("--preflight", action="store_true", help="Check real Yandex Music token access without writing raw data.") + parser.add_argument("--status", action="store_true", help="Print local configuration status without calling Yandex Music or writing data.") + return parser.parse_args() + + +def status_payload(settings: Settings) -> dict[str, object]: + duckdb_path = Path(os.getenv("STREAMIFY_DUCKDB_PATH", "data/streamify.duckdb")) + report_path = Path(os.getenv("STREAMIFY_REPORT_PATH", "data/streamify_summary.md")) + snapshot_path = Path(os.getenv("STREAMIFY_SNAPSHOT_PATH", "data/streamify_snapshot.json")) + recommendations_dir = Path(os.getenv("STREAMIFY_RECOMMENDATIONS_DIR", "data/recommendations")) + raw_manifest = settings.raw_dir / "_manifest.json" + manifest_source = None + manifest_generated_at = None + manifest_read_error = None + if 
raw_manifest.exists(): + try: + manifest = json.loads(raw_manifest.read_text(encoding="utf-8")) + manifest_source = manifest.get("source") + manifest_generated_at = manifest.get("generated_at") + except (OSError, json.JSONDecodeError) as exc: + manifest_read_error = exc.__class__.__name__ + if settings.token: + next_step = "make acceptance-real" if manifest_source == "yandex_music" else "make preflight" + else: + next_step = "set YANDEX_MUSIC_TOKEN in .env or run make ingest-sample" + return { + "env_file_present": Path(".env").exists(), + "token_configured": bool(settings.token), + "raw_dir": str(settings.raw_dir), + "raw_manifest_exists": raw_manifest.exists(), + "last_source": manifest_source, + "last_generated_at": manifest_generated_at, + "manifest_read_error": manifest_read_error, + "duckdb_path": str(duckdb_path), + "duckdb_exists": duckdb_path.exists(), + "report_path": str(report_path), + "report_exists": report_path.exists(), + "snapshot_path": str(snapshot_path), + "snapshot_exists": snapshot_path.exists(), + "recommendations_dir": str(recommendations_dir), + "recommendations_exists": recommendations_dir.exists(), + "next_step": next_step, + } + + +def main() -> int: + args = parse_args() + settings = Settings.from_env(sample=args.sample) + + if args.status: + if args.sample: + print("--status reports local configuration; do not combine it with --sample.", file=sys.stderr) + return 2 + print(json.dumps(status_payload(settings), ensure_ascii=False, indent=2, sort_keys=True)) + return 0 + + settings.raw_dir.mkdir(parents=True, exist_ok=True) + + if args.preflight: + if args.sample: + print("--preflight checks a real Yandex Music token; do not combine it with --sample.", file=sys.stderr) + return 2 + if not settings.token: + print("YANDEX_MUSIC_TOKEN is not set. 
Add it to .env before running --preflight.", file=sys.stderr) + return 2 + try: + print(json.dumps(preflight_token(settings.token), ensure_ascii=False, indent=2, sort_keys=True)) + except YandexMusicIngestError as exc: + print(f"Yandex Music preflight failed: {exc}", file=sys.stderr) + return 1 + return 0 + + source = "sample" if args.sample else "yandex_music" + if args.sample: + payload = sample_payload() + diagnostics = sample_diagnostics(payload) + else: + if not settings.token: + print("YANDEX_MUSIC_TOKEN is not set. Use --sample for a local demo or add the token to .env.", file=sys.stderr) + return 2 + try: + result = fetch_ingest_result(settings.token) + payload = result.payload + diagnostics = result.diagnostics + except YandexMusicIngestError as exc: + print(f"Yandex Music ingestion failed: {exc}", file=sys.stderr) + return 1 + + diagnostics = {field: int(diagnostics.get(field, 0)) for field in DIAGNOSTIC_FIELDS} + + manifest = { + "generated_at": datetime.now(UTC).isoformat(), + "source": source, + "raw_dir": str(settings.raw_dir), + "json_only": args.json_only, + "adapter": client_metadata(), + "diagnostics": diagnostics, + "datasets": {}, + } + for name, rows in payload.items(): + jsonl_path = settings.raw_dir / f"{name}.jsonl" + parquet_path = settings.raw_dir / f"{name}.parquet" + count = write_jsonl(jsonl_path, rows) + parquet_written = False + if not args.json_only: + parquet_written = write_parquet_if_available(parquet_path, rows) + else: + remove_file_if_exists(parquet_path) + manifest["datasets"][name] = { + "jsonl_path": str(jsonl_path), + "row_count": count, + "jsonl_sha256": file_sha256(jsonl_path), + "parquet_written": parquet_written, + } + suffix = " + parquet" if parquet_written else "" + print(f"wrote {count:>5} rows to {jsonl_path}{suffix}") + + write_json(settings.raw_dir / "_manifest.json", manifest) + print(f"wrote manifest to {settings.raw_dir / '_manifest.json'}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) 
def load_dotenv(path: Path = Path(".env")) -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped, and a leading ``export `` is tolerated. Variables that are
    already present in the environment are never overwritten. A missing file
    is not an error.

    Args:
        path: Location of the dotenv file; defaults to ``.env`` relative to
            the current working directory.
    """
    if not path.exists():
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        if line.startswith("export "):
            line = line.removeprefix("export ").strip()
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip()
        # Remove only a *matching* pair of surrounding quotes. Stripping quote
        # characters from each end independently would corrupt values that
        # legitimately start or end with a quote (e.g. a secret ending in ').
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        if key:
            # setdefault keeps any value already exported by the real environment.
            os.environ.setdefault(key, value)
def remove_file_if_exists(path: Path) -> None:
    """Delete *path* if it is present; a missing file is silently ignored."""
    # ``missing_ok`` avoids the check-then-delete race of ``exists()`` + ``unlink()``.
    path.unlink(missing_ok=True)


def write_parquet_if_available(path: Path, rows: list[dict[str, Any]]) -> bool:
    """Write *rows* to a Parquet file when the optional ``pyarrow`` package is installed.

    Any existing file at *path* is removed first, so a Parquet file left over
    from an earlier run never survives next to newer data.

    Args:
        path: Destination file; parent directories are created as needed.
        rows: Row dictionaries. Rows may carry different key sets.

    Returns:
        ``True`` when a Parquet file was written, ``False`` when *rows* is
        empty or ``pyarrow`` cannot be imported.
    """
    remove_file_if_exists(path)
    if not rows:
        return False
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        return False

    # Without an explicit schema, ``Table.from_pylist`` takes its column names
    # from the first row only, so keys that appear solely in later rows would
    # be dropped. Pad every row to the union of keys (first-seen order) so no
    # column is lost; absent values become nulls.
    columns = list(dict.fromkeys(key for row in rows for key in row))
    uniform_rows = [{column: row.get(column) for column in columns} for row in rows]

    path.parent.mkdir(parents=True, exist_ok=True)
    table = pa.Table.from_pylist(uniform_rows)
    pq.write_table(table, path)
    return True
ingested_at, + }, + { + "track_id": "sample-track-3", + "title": "Repeat Signal", + "duration_ms": 241000, + "album_id": "sample-album-1", + "album_title": "Local Lake", + "genre": "electronic", + "release_year": 2024, + "label": "Streamify Lab", + "artist_ids": ["sample-artist-1", "sample-artist-3"], + "artist_names": ["Nadia Vector", "The Lineage"], + "liked": False, + "source": "sample", + "ingested_at": ingested_at, + }, + ] + artists = [ + {"artist_id": "sample-artist-1", "artist_name": "Nadia Vector", "source": "sample", "ingested_at": ingested_at}, + {"artist_id": "sample-artist-2", "artist_name": "Duck DB Trio", "source": "sample", "ingested_at": ingested_at}, + {"artist_id": "sample-artist-3", "artist_name": "The Lineage", "source": "sample", "ingested_at": ingested_at}, + ] + albums = [ + {"album_id": "sample-album-1", "album_title": "Local Lake", "genre": "electronic", "release_year": 2024, "source": "sample", "ingested_at": ingested_at}, + {"album_id": "sample-album-2", "album_title": "Warehouse Sketches", "genre": "jazz", "release_year": 2023, "source": "sample", "ingested_at": ingested_at}, + ] + playlists = [ + {"playlist_id": "sample-playlist-1", "playlist_title": "Focus Rotation", "track_count": 2, "source": "sample", "ingested_at": ingested_at}, + {"playlist_id": "sample-playlist-2", "playlist_title": "Late Commits", "track_count": 2, "source": "sample", "ingested_at": ingested_at}, + ] + playlist_tracks = [ + {"playlist_id": "sample-playlist-1", "track_id": "sample-track-1", "position": 1, "source": "sample", "ingested_at": ingested_at}, + {"playlist_id": "sample-playlist-1", "track_id": "sample-track-2", "position": 2, "source": "sample", "ingested_at": ingested_at}, + {"playlist_id": "sample-playlist-2", "track_id": "sample-track-1", "position": 1, "source": "sample", "ingested_at": ingested_at}, + {"playlist_id": "sample-playlist-2", "track_id": "sample-track-3", "position": 2, "source": "sample", "ingested_at": ingested_at}, + ] + events = [ 
def _sanitize_message(message: str, token: str | None = None) -> str:
    """Mask the access token inside an error message.

    Args:
        message: Text that may embed the secret.
        token: Secret to hide; ``None`` or an empty string leaves *message*
            untouched.

    Returns:
        *message* with every occurrence of *token* replaced by
        ``[redacted-token]``.
    """
    if not token:
        return message
    return message.replace(token, "[redacted-token]")
def _first_non_empty(values: list[Any]) -> Any:
    """Return the first element that is neither ``None`` nor the empty string.

    Other falsy values such as ``0`` or ``False`` count as present.

    Args:
        values: Candidates in priority order.

    Returns:
        The first qualifying element, or ``None`` when there is none.
    """
    for value in values:
        # Compare explicitly instead of testing membership in a set literal:
        # a set lookup hashes the candidate and raises TypeError for
        # unhashable values such as lists or dicts.
        if value is not None and value != "":
            return value
    return None
def _iso_timestamp(value: Any) -> str | None:
    """Convert a timestamp-like value to an ISO-8601 string.

    Args:
        value: ``None``, a ``datetime``, a number of epoch seconds or
            milliseconds, or any other object.

    Returns:
        * ``None`` for ``None``.
        * ``datetime.isoformat()`` for datetimes; naive values are treated
          as UTC.
        * A UTC ISO string for numbers. Values above ``10_000_000_000`` are
          interpreted as milliseconds, everything else as seconds.
        * ``None`` for numbers that cannot be represented as a datetime
          (infinity, NaN, out-of-range values).
        * ``str(value)`` for anything else.
    """
    if value is None:
        return None
    if isinstance(value, datetime):
        if value.tzinfo is None:
            value = value.replace(tzinfo=timezone.utc)
        return value.isoformat()
    if isinstance(value, (int, float)):
        try:
            seconds = value / 1000 if value > 10_000_000_000 else value
            return datetime.fromtimestamp(seconds, tz=timezone.utc).isoformat()
        except (OverflowError, OSError, ValueError):
            # One malformed timestamp must not abort the whole ingest; report
            # it as unavailable so callers can apply their own fallback.
            return None
    return str(value)
def build_ingest_result_from_client(client: Any) -> IngestResult:
    """Collect likes and playlists from *client* and normalise them into raw rows.

    The function reads liked tracks, the user's playlists (with their tracks)
    and, when the client exposes the corresponding methods, liked albums,
    liked artists and liked playlists. Every API call goes through
    ``_call_with_retries``.

    Args:
        client: Object exposing ``users_likes_tracks`` and
            ``users_playlists_list``; ``users_likes_albums``,
            ``users_likes_artists`` and ``users_likes_playlists`` are used
            only if present.

    Returns:
        An ``IngestResult`` whose ``payload`` holds the datasets ``tracks``,
        ``artists``, ``albums``, ``playlists``, ``playlist_tracks`` and
        ``user_library_events``, and whose ``diagnostics`` holds the counters
        initialised below.
    """
    # One timestamp is shared by every row produced in this run.
    ingested_at = datetime.now(timezone.utc).isoformat()
    # Counters describing what was seen, written and skipped per entity kind.
    diagnostics = {
        "liked_shortcuts_seen": 0,
        "liked_tracks_written": 0,
        "liked_shortcuts_fetch_failed": 0,
        "liked_shortcuts_missing_track_id": 0,
        "liked_tracks_duplicate_skipped": 0,
        "liked_albums_seen": 0,
        "liked_albums_written": 0,
        "liked_albums_missing_id": 0,
        "liked_albums_duplicate_skipped": 0,
        "liked_artists_seen": 0,
        "liked_artists_written": 0,
        "liked_artists_missing_id": 0,
        "liked_artists_duplicate_skipped": 0,
        "liked_playlists_seen": 0,
        "liked_playlists_written": 0,
        "liked_playlists_missing_id": 0,
        "liked_playlists_duplicate_skipped": 0,
        "playlists_seen": 0,
        "playlists_written": 0,
        "playlists_missing_id": 0,
        "playlist_fetch_fallbacks": 0,
        "playlist_tracks_seen": 0,
        "playlist_tracks_written": 0,
        "playlist_tracks_fetch_failed": 0,
        "playlist_tracks_missing_track_id": 0,
        "playlist_tracks_duplicate_skipped": 0,
    }

    # --- Liked tracks -------------------------------------------------------
    liked_tracks = []
    liked_track_ids: set[str] = set()
    liked_tracks_response = _call_with_retries("client.users_likes_tracks", client.users_likes_tracks)
    for shortcut in getattr(liked_tracks_response, "tracks", []) or []:
        diagnostics["liked_shortcuts_seen"] += 1
        track, track_fetch_failed = _track_from_liked_shortcut(shortcut)
        # A failed fetch is counted but not fatal: the fallback object returned
        # alongside the flag is still used if it carries an id.
        if track_fetch_failed:
            diagnostics["liked_shortcuts_fetch_failed"] += 1
        if not _has_track_id(track):
            diagnostics["liked_shortcuts_missing_track_id"] += 1
            continue
        track_id = _track_id(track)
        if track_id in liked_track_ids:
            diagnostics["liked_tracks_duplicate_skipped"] += 1
            continue
        liked_track_ids.add(track_id)
        normalized = _normalize_track(track, "yandex_music", ingested_at, liked=True)
        # The like time comes from the shortcut; fall back to the run timestamp.
        normalized["liked_at"] = _available_timestamp(shortcut) or ingested_at
        liked_tracks.append(normalized)
        diagnostics["liked_tracks_written"] += 1

    # --- User playlists and their tracks -----------------------------------
    playlists_by_id: dict[str, dict[str, Any]] = {}
    playlist_tracks: list[dict[str, Any]] = []
    # Track rows keyed by track id (one row per track across all playlists).
    playlist_track_rows: dict[str, dict[str, Any]] = {}
    # (playlist_id, track_id) pairs already emitted, used to drop repeats.
    playlist_track_memberships: set[tuple[str, str]] = set()
    for playlist in _call_with_retries("client.users_playlists_list", client.users_playlists_list) or []:
        diagnostics["playlists_seen"] += 1
        playlist_id = _playlist_id(playlist)
        if not playlist_id:
            diagnostics["playlists_missing_id"] += 1
            continue
        playlists_by_id[playlist_id] = _normalize_playlist(playlist, "yandex_music", ingested_at)
        diagnostics["playlists_written"] += 1
        full_playlist, used_fallback = _playlist_with_tracks(playlist)
        if used_fallback:
            diagnostics["playlist_fetch_fallbacks"] += 1
        # ``position`` counts every item of the playlist, including items that
        # are skipped below, so written positions may have gaps.
        for position, item in enumerate(_playlist_items(full_playlist), start=1):
            diagnostics["playlist_tracks_seen"] += 1
            track, track_fetch_failed = _track_from_playlist_item(item)
            if track_fetch_failed:
                diagnostics["playlist_tracks_fetch_failed"] += 1
            if not _has_track_id(track):
                diagnostics["playlist_tracks_missing_track_id"] += 1
                continue
            normalized = _normalize_track(track, "yandex_music", ingested_at, liked=False)
            membership_key = (playlist_id, normalized["track_id"])
            if membership_key in playlist_track_memberships:
                diagnostics["playlist_tracks_duplicate_skipped"] += 1
                continue
            playlist_track_memberships.add(membership_key)
            playlist_track_rows[normalized["track_id"]] = normalized
            playlist_tracks.append(
                {
                    "playlist_id": playlist_id,
                    "track_id": normalized["track_id"],
                    "position": position,
                    "added_at": _available_timestamp(item),
                    "source": "yandex_music",
                    "ingested_at": ingested_at,
                }
            )
            diagnostics["playlist_tracks_written"] += 1

    # --- Merge track rows ----------------------------------------------------
    # Liked rows are applied last, so a track that is both liked and in a
    # playlist keeps the liked row (``liked=True`` plus ``liked_at``).
    tracks_by_id = {row["track_id"]: row for row in playlist_track_rows.values()}
    for row in liked_tracks:
        tracks_by_id[row["track_id"]] = row

    # --- Albums and artists derived from the track rows --------------------
    artists: dict[str, dict[str, Any]] = {}
    albums: dict[str, dict[str, Any]] = {}
    for row in tracks_by_id.values():
        if row.get("album_id"):
            albums[row["album_id"]] = {
                "album_id": row["album_id"],
                "album_title": row.get("album_title"),
                "genre": row.get("genre"),
                "release_year": row.get("release_year"),
                "source": "yandex_music",
                "ingested_at": ingested_at,
            }
        # ids and names are paired positionally; entries with an empty id are dropped.
        for artist_id, artist_name in zip(row.get("artist_ids") or [], row.get("artist_names") or []):
            if artist_id:
                artists[artist_id] = {
                    "artist_id": artist_id,
                    "artist_name": artist_name,
                    "source": "yandex_music",
                    "ingested_at": ingested_at,
                }

    # --- Liked albums --------------------------------------------------------
    # The like endpoints are optional; a client without the method contributes nothing.
    liked_albums_response = _call_with_retries("client.users_likes_albums", client.users_likes_albums) if hasattr(client, "users_likes_albums") else []
    for like in liked_albums_response or []:
        diagnostics["liked_albums_seen"] += 1
        # A like may wrap the album or be the album itself.
        album = _value(like, "album", default=like)
        if not _has_album_id(album):
            diagnostics["liked_albums_missing_id"] += 1
            continue
        album_id = _album_id(album)
        # An album already derived from a track row is kept and counted as a duplicate.
        if album_id in albums:
            diagnostics["liked_albums_duplicate_skipped"] += 1
            continue
        albums[album_id] = _normalize_album(album, "yandex_music", ingested_at)
        diagnostics["liked_albums_written"] += 1

    # --- Liked artists -------------------------------------------------------
    liked_artists_response = _call_with_retries("client.users_likes_artists", client.users_likes_artists) if hasattr(client, "users_likes_artists") else []
    for like in liked_artists_response or []:
        diagnostics["liked_artists_seen"] += 1
        artist = _value(like, "artist", default=like)
        if not _has_artist_id(artist):
            diagnostics["liked_artists_missing_id"] += 1
            continue
        artist_id = _artist_id(artist)
        # Artists already derived from track rows are kept as they are.
        if artist_id in artists:
            diagnostics["liked_artists_duplicate_skipped"] += 1
            continue
        artists[artist_id] = _normalize_artist(artist, "yandex_music", ingested_at)
        diagnostics["liked_artists_written"] += 1

    # --- Liked playlists -----------------------------------------------------
    # Only the playlist row is recorded here; tracks of liked playlists are not fetched.
    liked_playlists_response = _call_with_retries("client.users_likes_playlists", client.users_likes_playlists) if hasattr(client, "users_likes_playlists") else []
    for like in liked_playlists_response or []:
        diagnostics["liked_playlists_seen"] += 1
        playlist = _value(like, "playlist", default=like)
        playlist_id = _playlist_id(playlist)
        if not playlist_id:
            diagnostics["liked_playlists_missing_id"] += 1
            continue
        if playlist_id in playlists_by_id:
            diagnostics["liked_playlists_duplicate_skipped"] += 1
            continue
        playlists_by_id[playlist_id] = _normalize_playlist(playlist, "yandex_music", ingested_at)
        diagnostics["liked_playlists_written"] += 1

    # --- Library events ------------------------------------------------------
    # One ``liked_track`` event per liked track ...
    events = [
        {
            "event_id": f"liked_track:{row['track_id']}",
            "event_type": "liked_track",
            "track_id": row["track_id"],
            "event_ts": row.get("liked_at") or ingested_at,
            "source": "yandex_music",
            "ingested_at": ingested_at,
        }
        for row in liked_tracks
    ]
    # ... and one ``playlist_membership`` event per written playlist/track pair.
    # Only these events carry a ``playlist_id`` key.
    events.extend(
        {
            "event_id": f"playlist_membership:{row['playlist_id']}:{row['track_id']}",
            "event_type": "playlist_membership",
            "track_id": row["track_id"],
            "playlist_id": row["playlist_id"],
            "event_ts": row.get("added_at") or ingested_at,
            "source": "yandex_music",
            "ingested_at": ingested_at,
        }
        for row in playlist_tracks
    )

    return IngestResult(
        payload={
            "tracks": list(tracks_by_id.values()),
            "artists": list(artists.values()),
            "albums": list(albums.values()),
            "playlists": list(playlists_by_id.values()),
            "playlist_tracks": playlist_tracks,
            "user_library_events": events,
        },
        diagnostics=diagnostics,
    )
def fetch_ingest_result(token: str) -> IngestResult:
    """Authenticate with *token* and run a full ingestion.

    Args:
        token: Yandex Music access token.

    Returns:
        The ``IngestResult`` produced by ``build_ingest_result_from_client``.

    Raises:
        YandexMusicIngestError: If building the result fails for any reason;
            the token is redacted from the message. Errors raised while
            creating the client propagate from ``client_from_token``.
    """
    client = client_from_token(token)
    try:
        result = build_ingest_result_from_client(client)
    except Exception as exc:  # noqa: BLE001 - external client exceptions vary by version.
        detail = str(exc) or exc.__class__.__name__
        raise YandexMusicIngestError(_sanitize_message(detail, token)) from exc
    return result