Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Public research datasets, scripts, charts, and publishing assets from Thunderbit
| Project | Description |
|---|---|
| [`us-realtor-youtube-atlas-2026`](us-realtor-youtube-atlas-2026/) | Analysis of 179 US real-estate-adjacent YouTube channels and 3,839 recent videos, focused on what real estate video formats actually get views |
| [`us-dtc-youtube-atlas-2026`](us-dtc-youtube-atlas-2026/) | Analysis of 277 US-market DTC-adjacent YouTube channels and 5,695 recent videos, focused on which DTC content types actually get views in 2024-2026 |

## Notes

Expand Down
8 changes: 8 additions & 0 deletions us-dtc-youtube-atlas-2026/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
__pycache__/
*.pyc
.DS_Store
.env
.venv/
venv/
.youtube_api_key
raw/
306 changes: 306 additions & 0 deletions us-dtc-youtube-atlas-2026/00_build_channel_pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
"""
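00_build_channel_pool.py — build a pool of 200+ US DTC / ecom YouTube channels

Strategy:
1. Seed list: 35+ well-known DTC operator / educator / founder YouTube channels,
focused on creator-operators who teach how to run DTC; pure brand-official accounts are not collected
2. YouTube search.list expansion: keywords such as DTC / Shopify / FB ads / Klaviyo
3. Filter: country in US/GB/CA/AU (or empty) + channel description / title contains an ecom keyword (seed channels are always kept)
4. Deduplicate and output the channel pool

Output: out/channel_pool.csv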
"""

from __future__ import annotations
import csv
import json
import os
import sys
import time
import urllib.parse
import urllib.request
from collections import OrderedDict
from pathlib import Path

# Resolve everything relative to this script so it can be run from any
# working directory. Both directories are created up front.
ROOT = Path(__file__).resolve().parent
OUT = ROOT / "out"
RAW = ROOT / "raw"
OUT.mkdir(parents=True, exist_ok=True)
RAW.mkdir(parents=True, exist_ok=True)

# The API key lives outside the repository in the user's home directory.
# Path.read_text() opens and closes the file in one call, so no file handle
# is left dangling the way a bare open(...).read() would leave one.
API_KEY = Path(os.path.expanduser("~/.youtube_api_key")).read_text().strip()
API = "https://www.googleapis.com/youtube/v3"
TIMEOUT = 20  # seconds; passed to urlopen() for every API request


def api_get(endpoint: str, params: dict) -> dict:
    """Send a GET request to a YouTube Data API endpoint and decode the JSON reply.

    Args:
        endpoint: Path segment appended to the API base URL (e.g. ``"channels"``).
        params: Query parameters; the API key is added under ``"key"``.

    Returns:
        The decoded JSON object, or an empty dict when the request or the
        decoding fails. Failures are reported on stderr, never raised.
    """
    query = urllib.parse.urlencode({**params, "key": API_KEY})
    request_url = f"{API}/{endpoint}?{query}"
    try:
        with urllib.request.urlopen(request_url, timeout=TIMEOUT) as response:
            payload = response.read()
            return json.loads(payload)
    except urllib.error.HTTPError as http_err:
        # Only the first 300 characters of the error body are shown.
        snippet = http_err.read().decode("utf-8", errors="ignore")[:300]
        print(f" [HTTP {http_err.code}] {endpoint}: {snippet}", file=sys.stderr)
    except Exception as err:
        print(f" [ERR] {endpoint}: {err}", file=sys.stderr)
    # Reached only after one of the handlers above has logged a failure.
    return {}


# ============================================================
# 1. Seed channel list — DTC operator-educator-founder
# ============================================================

# Primary seed handles, grouped into tiers by the kind of creator.
# main() resolves each handle to a channel ID with resolve_handle(); channels
# that come from this list are kept by the keyword/country filter regardless
# of whether they match it.
SEED_HANDLES = [
    # Tier A — operator-educators who built/scaled DTC brands
    "@daviefogarty",  # Davie Fogarty / The Oodie
    "@AlexHormozi",  # Acquisition.com (DTC adjacent, huge)
    "@beardbrand",  # Eric Bandholz / Beardbrand founder
    "@ezraf",  # Ezra Firestone / Smart Marketer / BOOM
    "@gregisenberg",  # Greg Isenberg (community + DTC trends)
    "@daviedavid",  # alt slug for Davie Fogarty content
    "@MikeBeckham",  # Simple Modern founder
    "@niksharma",  # Nik Sharma / Sharma Brands

    # Tier B — DTC operators + agency leads (educational)
    "@CommonThreadCo",  # Common Thread Collective / Taylor Holiday
    "@MyFirstMillionPod",  # MFM Pod (Sam Parr / Shaan Puri)
    "@TheHustleDaily",  # The Hustle
    "@Foundr",  # Foundr Magazine
    "@WholesaleTed",  # Sarah Chrisp
    "@SteveTanOfficial",  # Steve Tan (Asia + US DTC)
    "@StevenBartlett",  # Diary of a CEO (UK but US audience)

    # Tier C — Ads & marketing for DTC
    "@BenHeath",  # Ben Heath / FB ads
    "@TheCharlieChang",  # Charlie Chang / FB ads
    "@AaronFletcherOfficial",  # Aaron Fletcher
    "@hayden_bowles",  # Hayden Bowles / dropship + DTC
    "@neilpatel",  # SEO + DTC marketing
    "@amyporterfield",  # email marketing
    "@RyanDeiss",  # DigitalMarketer

    # Tier D — Shopify ecosystem
    "@Shopify",  # official Shopify channel
    "@ShopifyMasters",  # Shopify Masters podcast

    # Tier E — Brand / creative for DTC
    "@PhilipVanDusen",  # branding for entrepreneurs
    "@TheFutur",  # design + biz for creatives

    # Tier F — Newsletter / Pod operators (DTC overlap)
    "@SamParr",  # Hampton / ex-Hustle
    "@ColinAndSamir",  # creator economy + DTC

    # Tier G — Founder vlog / brand founder content
    "@noahkagan",  # AppSumo
    "@AppSumo",  # AppSumo official
    "@gymshark",  # Gymshark (brand but Ben Francis publishes)

    # Tier H — Adjacents (entrepreneurship, often DTC stories)
    "@DanMartell",  # SaaS but DTC overlap
    "@SimonSquibb",  # entrepreneur education
    "@PatFlynn",  # passive income, DTC adjacent
    "@TheRealDanLok",  # business education
]


# Auxiliary seed handles. main() concatenates this list after SEED_HANDLES and
# resolves both the same way; a handle that fails to resolve is only logged.
SEED_AUX = [
    # Less-certain handles — search will catch them if these miss
    "@SunnyLenarduzzi",  # YouTube + DTC growth
    "@CalebMaddix",  # younger entrepreneur DTC
    "@TheStartupShow",  # startup interviews
    "@Bedros",  # entrepreneurship
    "@nickysilversocial",  # social media + DTC
]


def resolve_handle(handle: str) -> str | None:
    """Look up the channel ID for a YouTube handle via ``channels.list``.

    Args:
        handle: Channel handle, with or without leading ``@`` characters.

    Returns:
        The ``id`` of the first returned item, or ``None`` when the response
        contains no items (which is also what a failed request produces).
    """
    bare_handle = handle.lstrip("@")
    response = api_get("channels", {"part": "id", "forHandle": bare_handle})
    matches = response.get("items", [])
    return matches[0]["id"] if matches else None


# ============================================================
# 2. search.list expansion
# ============================================================

# Free-text queries used to widen the pool beyond the seed handles.
# main() runs search_channels() once per query and records the query that
# first surfaced each new channel in its "seed_query" field.
SEARCH_QUERIES = [
    "dtc brand",
    "ecommerce business",
    "shopify dropshipping",
    "facebook ads for ecommerce",
    "klaviyo email marketing",
    "build a dtc brand",
    "ecommerce entrepreneur",
    "shopify for beginners",
    "how to scale ecommerce",
    "shopify success story",
    "ecommerce ads tutorial",
    "how i built my brand",
    "tiktok shop seller",
    "amazon to shopify",
]


def search_channels(query: str, max_results: int = 30) -> list[dict]:
    """Run a ``search.list`` request with ``type=channel`` and return its items.

    Args:
        query: Free-text search query.
        max_results: Requested number of results; values above 50 are
            clamped to 50 before the request is sent.

    Returns:
        The ``items`` list of the response (each item carries a channel
        snippet), or an empty list when the response has no ``items``.
    """
    page_size = max_results if max_results < 50 else 50
    request = {
        "part": "snippet",
        "type": "channel",
        "q": query,
        "regionCode": "US",
        "relevanceLanguage": "en",
        "maxResults": page_size,
    }
    return api_get("search", request).get("items", [])


# ============================================================
# Main
# ============================================================

def main():
    """Build the DTC channel pool and write it to ``out/channel_pool.csv``.

    Steps:
        1. Resolve every seed handle to a channel ID.
        2. Extend the pool with ``search.list`` channel results per query.
        3. Fetch channel metadata in batches of 50 IDs.
        4. Keep channels whose country is US/GB/CA/AU (or empty) and whose
           title/description contains a DTC keyword; seed channels are
           always kept.
        5. Sort by subscriber count (descending), write the CSV and print
           the 20 largest channels.

    API failures never raise here: ``api_get`` returns an empty dict, so a
    failed call simply contributes nothing to the pool.
    """
    # Step 1: resolve seed handles
    pool: OrderedDict[str, dict] = OrderedDict()
    print("Step 1: resolve seed handles...")
    resolved, missed = 0, 0
    for handle in SEED_HANDLES + SEED_AUX:
        cid = resolve_handle(handle)
        if not cid:
            print(f" ⚠️ {handle} → not found")
            missed += 1
        elif cid in pool:
            # Two handles can point at the same channel. Keep the entry (and
            # handle) recorded first and do not count the channel twice.
            print(f" ↺ {handle} → {cid} (same channel as {pool[cid]['handle']})")
        else:
            pool[cid] = {
                "channel_id": cid,
                "title": "",
                "handle": handle,
                "description": "",
                "country": "",
                "source": "seed",
                "seed_query": "",
            }
            print(f" ✅ {handle} → {cid}")
            resolved += 1
        time.sleep(0.1)
    print(f"\nStep 1: {resolved} resolved, {missed} missed (search will backfill)\n")

    # Step 2: search.list expand
    print("Step 2: search.list expansion...")
    for q in SEARCH_QUERIES:
        items = search_channels(q, max_results=30)
        new_count = 0
        for it in items:
            snippet = it["snippet"]
            cid = snippet["channelId"]
            if cid in pool:
                # Already known from a seed handle or an earlier query.
                continue
            pool[cid] = {
                "channel_id": cid,
                "title": snippet["title"],
                "handle": "",
                "description": snippet["description"][:200],
                "country": "",
                "source": "search",
                "seed_query": q,
            }
            new_count += 1
        print(f" [{q!r}] 新增 {new_count} 个 channel")
        time.sleep(0.5)

    print(f"\nStep 2 后总 {len(pool)} 个 channel\n")

    # Step 3: fetch channels.list details (country, viewCount, subscriberCount)
    print("Step 3: 拉 channel statistics...")
    channel_ids = list(pool.keys())
    enriched: set[str] = set()  # channel IDs that actually received metadata
    for batch_start in range(0, len(channel_ids), 50):
        batch = channel_ids[batch_start:batch_start + 50]
        data = api_get("channels", {
            "part": "snippet,statistics,brandingSettings,topicDetails,contentDetails",
            "id": ",".join(batch),
        })
        for item in data.get("items", []):
            cid = item["id"]
            if cid not in pool:
                continue
            # Read every part defensively so one incomplete item cannot
            # abort the whole run.
            sn = item.get("snippet", {})
            st = item.get("statistics", {})
            br = item.get("brandingSettings", {}).get("channel", {})
            td = item.get("topicDetails", {}).get("topicCategories", [])
            pool[cid].update({
                "title": sn.get("title") or pool[cid]["title"],
                "description": sn.get("description", "")[:300],
                "country": sn.get("country", "") or br.get("country", ""),
                "published_at": sn.get("publishedAt", "")[:10],
                "subscriber_count": st.get("subscriberCount", "0"),
                "view_count": st.get("viewCount", "0"),
                "video_count": st.get("videoCount", "0"),
                "topic_categories": ";".join(td)[:300],
                "default_language": sn.get("defaultLanguage", ""),
                "uploads_playlist_id": item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads", ""),
            })
            enriched.add(cid)
        time.sleep(0.3)
    # Report the number of channels that really got metadata. A failed batch
    # yields no items, so this can be lower than the pool size.
    print(f"已拉 {len(enriched)} 个 channel 的 metadata\n")
    if len(enriched) < len(pool):
        print(
            f" [WARN] {len(pool) - len(enriched)} channels have no metadata "
            "(failed batch or channel not returned by the API)",
            file=sys.stderr,
        )

    # Step 4: filter — keep country US/GB/CA/AU/empty + title/description
    # containing a DTC/ecom keyword
    # NOTE(review): keywords are matched as plain substrings, so short ones
    # such as "ads" also match inside longer words (e.g. "downloads").
    DTC_KEYWORDS = [
        "ecommerce", "e-commerce", "dtc", "direct-to-consumer", "direct to consumer",
        "shopify", "klaviyo", "online store", "online business",
        "brand", "founder", "drop shipping", "dropshipping", "drop-shipping",
        "sell online", "ads", "marketing", "amazon seller", "fba",
        "7-figure", "8-figure", "7 figure", "8 figure", "six figure", "seven figure",
        "scale", "scaling", "entrepreneur", "startup",
    ]

    filtered: list[dict] = []
    for cid, d in pool.items():
        title = (d.get("title") or "").lower()
        desc = (d.get("description") or "").lower()
        country = (d.get("country") or "").upper()
        text = title + " " + desc

        # US-centric: prefer US; allow GB/CA/AU/empty (DTC creators often borderless)
        country_ok = country in {"US", "GB", "CA", "AU", ""}
        has_keyword = any(kw in text for kw in DTC_KEYWORDS)
        # Seed channels are kept even when they do not strictly match.
        if (country_ok and has_keyword) or d["source"] == "seed":
            filtered.append(d)

    print(f"过滤后:{len(filtered)} channels(US/GB/CA/AU 优先 + DTC 关键词)\n")

    # Step 5: sort by subscriber_count. A missing or empty count sorts as 0.
    filtered.sort(key=lambda d: int(d.get("subscriber_count", "0") or "0"), reverse=True)

    # Write the CSV. extrasaction="ignore" drops keys that are not in `fields`;
    # keys missing from a row are written as empty cells.
    csv_path = OUT / "channel_pool.csv"
    fields = [
        "channel_id", "title", "handle", "country", "subscriber_count", "view_count",
        "video_count", "published_at", "uploads_playlist_id",
        "topic_categories", "default_language", "source", "seed_query", "description",
    ]
    with csv_path.open("w", newline="", encoding="utf-8") as fp:
        w = csv.DictWriter(fp, fieldnames=fields, extrasaction="ignore")
        w.writeheader()
        w.writerows(filtered)
    print(f"✅ {csv_path}")

    # Top 20 by subscriber count
    print("\nTop 20 channels by subscriber count:")
    for i, d in enumerate(filtered[:20], 1):
        sub = int(d.get("subscriber_count", "0") or "0")
        print(f" {i:>2}. {d['title']:<35} {sub:>10,} subs [{d['country'] or '?'}] ({d['source']})")


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
57 changes: 57 additions & 0 deletions us-dtc-youtube-atlas-2026/00b_patch_uploads_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""00b_patch_uploads_id.py — 补抓 uploads_playlist_id 字段(00 漏了 contentDetails part)"""
import csv, json, os, time, urllib.parse, urllib.request, sys
from pathlib import Path

# Paths are resolved relative to this script so it can run from any directory.
ROOT = Path(__file__).resolve().parent
OUT = ROOT / "out"
# Path.read_text() opens and closes the key file in one call, so no file
# handle is left dangling the way a bare open(...).read() would leave one.
API_KEY = Path(os.path.expanduser("~/.youtube_api_key")).read_text().strip()
API = "https://www.googleapis.com/youtube/v3"


def api_get(endpoint, params):
    """Send a GET request to a YouTube Data API endpoint and decode the JSON reply.

    Args:
        endpoint: Path segment appended to the API base URL (e.g. ``"channels"``).
        params: Query parameters; the API key is added under ``"key"``.

    Returns:
        The decoded JSON object, or an empty dict when the request or the
        decoding fails. Failures are reported on stderr, never raised.
    """
    # Imported locally so the block does not depend on the module's import list.
    from urllib.error import HTTPError

    params = {**params, "key": API_KEY}
    qs = urllib.parse.urlencode(params)
    try:
        with urllib.request.urlopen(f"{API}/{endpoint}?{qs}", timeout=25) as r:
            return json.loads(r.read())
    except HTTPError as e:
        # Show the status code and the start of the response body; the
        # exception text alone only carries the status line.
        body = e.read().decode("utf-8", errors="ignore")[:300]
        print(f" [HTTP {e.code}] {endpoint}: {body}", file=sys.stderr)
        return {}
    except Exception as e:
        print(f" [ERR] {e}", file=sys.stderr)
        return {}


def main():
    """Backfill the ``uploads_playlist_id`` column of ``out/channel_pool.csv`` in place.

    Reads every row, asks ``channels.list`` (part ``contentDetails``) for the
    uploads playlist of each channel in batches of 50 IDs, and rewrites the
    CSV. Rows for which the API returns no playlist keep their existing value.
    """
    csv_path = OUT / "channel_pool.csv"
    # newline="" lets the csv module handle line endings itself, and the
    # with-block closes the handle before the file is reopened for writing.
    with csv_path.open(newline="", encoding="utf-8") as fp:
        reader = csv.DictReader(fp)
        rows = list(reader)
        fieldnames = list(reader.fieldnames or [])
    print(f"补抓 {len(rows)} 个 channel 的 uploads_playlist_id...")

    if not rows:
        # Nothing to patch; leave the file untouched rather than failing later.
        print(f" [WARN] {csv_path} has no data rows; nothing to patch", file=sys.stderr)
        return

    # Make sure the target column is part of the header even if the input
    # CSV was written without it; otherwise DictWriter rejects patched rows.
    if "uploads_playlist_id" not in fieldnames:
        fieldnames.append("uploads_playlist_id")

    ids = [r["channel_id"] for r in rows]
    upl_map: dict[str, str] = {}
    for i in range(0, len(ids), 50):
        batch = ids[i:i + 50]
        data = api_get("channels", {"part": "contentDetails", "id": ",".join(batch)})
        for item in data.get("items", []):
            cid = item["id"]
            upl = item.get("contentDetails", {}).get("relatedPlaylists", {}).get("uploads", "")
            upl_map[cid] = upl
        time.sleep(0.2)

    # Update rows
    fixed = 0
    for r in rows:
        upl = upl_map.get(r["channel_id"], "")
        if upl:
            r["uploads_playlist_id"] = upl
            fixed += 1
    print(f"已补 {fixed}/{len(rows)} 条 uploads_playlist_id")

    # Write back
    with csv_path.open("w", newline="", encoding="utf-8") as fp:
        w = csv.DictWriter(fp, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)
    print(f"✅ 已更新 {csv_path}")


# Run the patch only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading