thunderbit-operations · jackguam342-cmd · Jun 3, 2026
diff --git a/README.md b/README.md
@@ -7,6 +7,7 @@ Public research datasets, scripts, charts, and publishing assets from Thunderbit
 | Project | Description |
 |---|---|
 | [`us-realtor-youtube-atlas-2026`](us-realtor-youtube-atlas-2026/) | Analysis of 179 US real-estate-adjacent YouTube channels and 3,839 recent videos, focused on what real estate video formats actually get views |
+| [`us-dtc-youtube-atlas-2026`](us-dtc-youtube-atlas-2026/) | Analysis of 277 US-market DTC-adjacent YouTube channels and 5,695 recent videos, focused on which DTC content types actually get views in 2024-2026 |
 
 ## Notes
 

diff --git a/us-dtc-youtube-atlas-2026/.gitignore b/us-dtc-youtube-atlas-2026/.gitignore
@@ -0,0 +1,8 @@
+__pycache__/
+*.pyc
+.DS_Store
+.env
+.venv/
+venv/
+.youtube_api_key
+raw/
diff --git a/us-dtc-youtube-atlas-2026/00_build_channel_pool.py b/us-dtc-youtube-atlas-2026/00_build_channel_pool.py
@@ -0,0 +1,306 @@
+"""
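+00_build_channel_pool.py — build a pool of 200+ US DTC / ecom YouTube channels
+
+Strategy:
+  1. Seed list: 35+ well-known DTC operator / educator / founder YouTube channels,
+     focused on creator-operators who teach how to run DTC; pure brand-official
+     accounts are not collected
+  2. YouTube search.list expansion: DTC / Shopify / FB ads / Klaviyo and similar keywords
+  3. Filter: country in US/GB/CA/AU (or empty) + channel description / title contains
+     an ecom keyword; seed channels are kept even when they do not match
+  4. Deduplicate and write the channel pool
+
+Output: out/channel_pool.csv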
+"""
+
+from __future__ import annotations
+import csv
+import json
+import os
+import sys
+import time
+import urllib.parse
+import urllib.request
+from collections import OrderedDict
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent
+OUT = ROOT / "out"
+RAW = ROOT / "raw"
+OUT.mkdir(parents=True, exist_ok=True)
+RAW.mkdir(parents=True, exist_ok=True)
+
+API_KEY = open(os.path.expanduser("~/.youtube_api_key")).read().strip()
+API = "https://www.googleapis.com/youtube/v3"
+TIMEOUT = 20
+
+
+def api_get(endpoint: str, params: dict) -> dict:
+    params = {**params, "key": API_KEY}
+    qs = urllib.parse.urlencode(params)
+    url = f"{API}/{endpoint}?{qs}"
+    try:
+        with urllib.request.urlopen(url, timeout=TIMEOUT) as r:
+            return json.loads(r.read())
+    except urllib.error.HTTPError as e:
+        body = e.read().decode("utf-8", errors="ignore")[:300]
+        print(f"  [HTTP {e.code}] {endpoint}: {body}", file=sys.stderr)
+        return {}
+    except Exception as e:
+        print(f"  [ERR] {endpoint}: {e}", file=sys.stderr)
+        return {}
+
+
+# ============================================================
+# 1. Seed channel list — DTC operator-educator-founder
+# ============================================================
+
+# Each handle is passed to resolve_handle() by main(); handles that do not
+# resolve are reported as "not found" and skipped.
+SEED_HANDLES = [
+    # Tier A — operator-educators who built/scaled DTC brands
+    "@daviefogarty",            # Davie Fogarty / The Oodie
+    "@AlexHormozi",             # Acquisition.com (DTC adjacent, huge)
+    "@beardbrand",              # Eric Bandholz / Beardbrand founder
+    "@ezraf",                   # Ezra Firestone / Smart Marketer / BOOM
+    "@gregisenberg",            # Greg Isenberg (community + DTC trends)
+    "@daviedavid",              # alt slug for Davie Fogarty content
+    "@MikeBeckham",             # Simple Modern founder
+    "@niksharma",               # Nik Sharma / Sharma Brands
+
+    # Tier B — DTC operators + agency leads (educational)
+    "@CommonThreadCo",          # Common Thread Collective / Taylor Holiday
+    "@MyFirstMillionPod",       # MFM Pod (Sam Parr / Shaan Puri)
+    "@TheHustleDaily",          # The Hustle
+    "@Foundr",                  # Foundr Magazine
+    "@WholesaleTed",            # Sarah Chrisp
+    "@SteveTanOfficial",        # Steve Tan (Asia + US DTC)
+    "@StevenBartlett",          # Diary of a CEO (UK but US audience)
+
+    # Tier C — Ads & marketing for DTC
+    "@BenHeath",                # Ben Heath / FB ads
+    "@TheCharlieChang",         # Charlie Chang / FB ads
+    "@AaronFletcherOfficial",   # Aaron Fletcher
+    "@hayden_bowles",           # Hayden Bowles / dropship + DTC
+    "@neilpatel",               # SEO + DTC marketing
+    "@amyporterfield",          # email marketing
+    "@RyanDeiss",               # DigitalMarketer
+
+    # Tier D — Shopify ecosystem
+    "@Shopify",                 # official Shopify channel
+    "@ShopifyMasters",          # Shopify Masters podcast
+
+    # Tier E — Brand / creative for DTC
+    "@PhilipVanDusen",          # branding for entrepreneurs
+    "@TheFutur",                # design + biz for creatives
+
+    # Tier F — Newsletter / Pod operators (DTC overlap)
+    "@SamParr",                 # Hampton / ex-Hustle
+    "@ColinAndSamir",           # creator economy + DTC
+
+    # Tier G — Founder vlog / brand founder content
+    "@noahkagan",               # AppSumo
+    "@AppSumo",                 # AppSumo official
+    "@gymshark",                # Gymshark (brand but Ben Francis publishes)
+
+    # Tier H — Adjacents (entrepreneurship, often DTC stories)
+    "@DanMartell",              # SaaS but DTC overlap
+    "@SimonSquibb",             # entrepreneur education
+    "@PatFlynn",                # passive income, DTC adjacent
+    "@TheRealDanLok",           # business education
+]
+
+
+# Extra seed handles; main() resolves them in the same pass as SEED_HANDLES
+# and tags them with source="seed" as well.
+SEED_AUX = [
+    # Less-certain handles — search will catch them if these miss
+    "@SunnyLenarduzzi",          # YouTube + DTC growth
+    "@CalebMaddix",              # younger entrepreneur DTC
+    "@TheStartupShow",           # startup interviews
+    "@Bedros",                   # entrepreneurship
+    "@nickysilversocial",        # social media + DTC
+]
+
+
+def resolve_handle(handle: str) -> str | None:
+    """@handle → channelId via channels.list with forHandle param."""
+    h = handle.lstrip("@")
+    data = api_get("channels", {
+        "part": "id",
+        "forHandle": h,
+    })
+    items = data.get("items", [])
+    if items:
+        return items[0]["id"]
+    return None
+
+
+# ============================================================
+# 2. search.list expansion
+# ============================================================
+
+# Each query is sent once to search_channels() by main(); channels already in
+# the pool are skipped there, so overlapping queries do not create duplicates.
+SEARCH_QUERIES = [
+    "dtc brand",
+    "ecommerce business",
+    "shopify dropshipping",
+    "facebook ads for ecommerce",
+    "klaviyo email marketing",
+    "build a dtc brand",
+    "ecommerce entrepreneur",
+    "shopify for beginners",
+    "how to scale ecommerce",
+    "shopify success story",
+    "ecommerce ads tutorial",
+    "how i built my brand",
+    "tiktok shop seller",
+    "amazon to shopify",
+]
+
+
+def search_channels(query: str, max_results: int = 30) -> list[dict]:
+    """search.list type=channel,返回 channel snippets。"""
+    data = api_get("search", {
+        "part": "snippet",
+        "type": "channel",
+        "q": query,
+        "regionCode": "US",
+        "relevanceLanguage": "en",
+        "maxResults": min(max_results, 50),
+    })
+    return data.get("items", [])
+
+
+# ============================================================
+# Main
+# ============================================================
+
+def main():
+    # Step 1: resolve seed handles
+    pool: OrderedDict[str, dict] = OrderedDict()
+    print("Step 1: resolve seed handles...")
+    resolved, missed = 0, 0
+    for handle in SEED_HANDLES + SEED_AUX:
+        cid = resolve_handle(handle)
+        if cid:
+            pool[cid] = {
+                "channel_id": cid,
+                "title": "",
+                "handle": handle,
+                "description": "",
+                "country": "",
+                "source": "seed",
+                "seed_query": "",
+            }
+            print(f"  ✅ {handle} → {cid}")
+            resolved += 1
+        else:
+            print(f"  ⚠️  {handle} → not found")
+            missed += 1
+        time.sleep(0.1)
+    print(f"\nStep 1: {resolved} resolved, {missed} missed (search will backfill)\n")
+
+    # Step 2: search.list expand
+    print("Step 2: search.list expansion...")
+    for q in SEARCH_QUERIES:
+        items = search_channels(q, max_results=30)
+        new_count = 0
+        for it in items:
+            cid = it["snippet"]["channelId"]
+            if cid in pool:
+                continue
+            pool[cid] = {
+                "channel_id": cid,
+                "title": it["snippet"]["title"],
+                "handle": "",
+                "description": it["snippet"]["description"][:200],
+                "country": "",
+                "source": "search",
+                "seed_query": q,
+            }
+            new_count += 1
+        print(f"  [{q!r}] 新增 {new_count} 个 channel")
+        time.sleep(0.5)
+
+    print(f"\nStep 2 后总 {len(pool)} 个 channel\n")
+
+    # Step 3: 拿 channels.list 详细信息(country, viewCount, subscriberCount)
+    print("Step 3: 拉 channel statistics...")
+    channel_ids = list(pool.keys())
+    for batch_start in range(0, len(channel_ids), 50):
+        batch = channel_ids[batch_start:batch_start + 50]
+        data = api_get("channels", {
+            "part": "snippet,statistics,brandingSettings,topicDetails,contentDetails",
+            "id": ",".join(batch),
+        })
+        for item in data.get("items", []):
+            cid = item["id"]
+            if cid not in pool:
+                continue
+            sn = item["snippet"]
+            st = item["statistics"]
+            br = item.get("brandingSettings", {}).get("channel", {})
+            td = item.get("topicDetails", {}).get("topicCategories", [])
+            pool[cid].update({
+                "title": sn.get("title") or pool[cid]["title"],
+                "description": sn.get("description", "")[:300],
+                "country": sn.get("country", "") or br.get("country", ""),
+                "published_at": sn.get("publishedAt", "")[:10],
+                "subscriber_count": st.get("subscriberCount", "0"),
+                "view_count": st.get("viewCount", "0"),
+                "video_count": st.get("videoCount", "0"),
+                "topic_categories": ";".join(td)[:300],
+                "default_language": sn.get("defaultLanguage", ""),
+                "uploads_playlist_id": item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads", ""),
+            })
+        time.sleep(0.3)
+    print(f"已拉 {len(pool)} 个 channel 的 metadata\n")
+
+    # Step 4: 过滤 — 留 country=US/empty + description/title 含 DTC/ecom 关键词
+    DTC_KEYWORDS = [
+        "ecommerce", "e-commerce", "dtc", "direct-to-consumer", "direct to consumer",
+        "shopify", "klaviyo", "online store", "online business",
+        "brand", "founder", "drop shipping", "dropshipping", "drop-shipping",
+        "sell online", "ads", "marketing", "amazon seller", "fba",
+        "7-figure", "8-figure", "7 figure", "8 figure", "six figure", "seven figure",
+        "scale", "scaling", "entrepreneur", "startup",
+    ]
+
+    filtered: list[dict] = []
+    for cid, d in pool.items():
+        title = (d.get("title") or "").lower()
+        desc = (d.get("description") or "").lower()
+        country = (d.get("country") or "").upper()
+        text = title + " " + desc
+
+        # US-centric: prefer US; allow GB/CA/AU/empty (DTC creators often borderless)
+        country_ok = country in {"US", "GB", "CA", "AU", ""}
+        has_keyword = any(kw in text for kw in DTC_KEYWORDS)
+        if country_ok and has_keyword:
+            filtered.append(d)
+        elif d["source"] == "seed":
+            # seed channels 即使不严格 match 也保留
+            filtered.append(d)
+
+    print(f"过滤后:{len(filtered)} channels(US/GB/CA/AU 优先 + DTC 关键词)\n")
+
+    # Step 5: 按 subscriber_count 排序
+    filtered.sort(key=lambda d: int(d.get("subscriber_count", "0") or "0"), reverse=True)
+
+    # 输出 CSV
+    csv_path = OUT / "channel_pool.csv"
+    fields = [
+        "channel_id", "title", "handle", "country", "subscriber_count", "view_count",
+        "video_count", "published_at", "uploads_playlist_id",
+        "topic_categories", "default_language", "source", "seed_query", "description",
+    ]
+    with csv_path.open("w", newline="", encoding="utf-8") as fp:
+        w = csv.DictWriter(fp, fieldnames=fields, extrasaction="ignore")
+        w.writeheader()
+        for d in filtered:
+            w.writerow(d)
+    print(f"✅ {csv_path}")
+
+    # Top 20 by subscriber count
+    print(f"\nTop 20 channels by subscriber count:")
+    for i, d in enumerate(filtered[:20], 1):
+        sub = int(d.get("subscriber_count", "0") or "0")
+        print(f"  {i:>2}. {d['title']:<35} {sub:>10,} subs  [{d['country'] or '?'}]  ({d['source']})")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/us-dtc-youtube-atlas-2026/00b_patch_uploads_id.py b/us-dtc-youtube-atlas-2026/00b_patch_uploads_id.py
@@ -0,0 +1,57 @@
+"""00b_patch_uploads_id.py — 补抓 uploads_playlist_id 字段(00 漏了 contentDetails part)"""
+import csv, json, os, time, urllib.parse, urllib.request, sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent
+OUT = ROOT / "out"
+API_KEY = open(os.path.expanduser("~/.youtube_api_key")).read().strip()
+API = "https://www.googleapis.com/youtube/v3"
+
+
+def api_get(endpoint, params):
+    params = {**params, "key": API_KEY}
+    qs = urllib.parse.urlencode(params)
+    try:
+        with urllib.request.urlopen(f"{API}/{endpoint}?{qs}", timeout=25) as r:
+            return json.loads(r.read())
+    except Exception as e:
+        print(f"  [ERR] {e}", file=sys.stderr)
+        return {}
+
+
+def main():
+    csv_path = OUT / "channel_pool.csv"
+    rows = list(csv.DictReader(csv_path.open(encoding="utf-8")))
+    print(f"补抓 {len(rows)} 个 channel 的 uploads_playlist_id...")
+
+    ids = [r["channel_id"] for r in rows]
+    upl_map: dict[str, str] = {}
+    for i in range(0, len(ids), 50):
+        batch = ids[i:i + 50]
+        data = api_get("channels", {"part": "contentDetails", "id": ",".join(batch)})
+        for item in data.get("items", []):
+            cid = item["id"]
+            upl = item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads", "")
+            upl_map[cid] = upl
+        time.sleep(0.2)
+
+    # 更新 row
+    fixed = 0
+    for r in rows:
+        upl = upl_map.get(r["channel_id"], "")
+        if upl:
+            r["uploads_playlist_id"] = upl
+            fixed += 1
+    print(f"已补 {fixed}/{len(rows)} 条 uploads_playlist_id")
+
+    # 写回
+    with csv_path.open("w", newline="", encoding="utf-8") as fp:
+        w = csv.DictWriter(fp, fieldnames=list(rows[0].keys()))
+        w.writeheader()
+        for r in rows:
+            w.writerow(r)
+    print(f"✅ 已更新 {csv_path}")
+
+
+if __name__ == "__main__":
+    main()