From c520fb3a8d64a6dde29d6f8d269213f9a14fbd6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 18:39:38 +0100 Subject: [PATCH 01/28] add setup.py --- config.py | 2 -- data/README.md | 3 --- src/substack2markdown/__init__.py | 0 .../substack2markdown/assets/author_template.html | 0 .../substack2markdown/assets}/css/essay-styles.css | 0 .../substack2markdown/assets}/css/style.css | 0 .../substack2markdown/assets}/images/screenshot.png | Bin .../substack2markdown/assets}/js/populate-essays.js | 0 .../substack2markdown/substack_scraper.py | 0 substack_html_pages/README.md | 3 --- 10 files changed, 8 deletions(-) delete mode 100644 config.py delete mode 100644 data/README.md create mode 100644 src/substack2markdown/__init__.py rename author_template.html => src/substack2markdown/assets/author_template.html (100%) rename {assets => src/substack2markdown/assets}/css/essay-styles.css (100%) rename {assets => src/substack2markdown/assets}/css/style.css (100%) rename {assets => src/substack2markdown/assets}/images/screenshot.png (100%) rename {assets => src/substack2markdown/assets}/js/populate-essays.js (100%) rename substack_scraper.py => src/substack2markdown/substack_scraper.py (100%) delete mode 100644 substack_html_pages/README.md diff --git a/config.py b/config.py deleted file mode 100644 index 8fc6bff2..00000000 --- a/config.py +++ /dev/null @@ -1,2 +0,0 @@ -EMAIL = "your-email@domain.com" -PASSWORD = "your-password" diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 27476ca6..00000000 --- a/data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.json` files for each writer -containing metadata that is used to populate a `.html` file for that -author. \ No newline at end of file diff --git a/src/substack2markdown/__init__.py b/src/substack2markdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/author_template.html b/src/substack2markdown/assets/author_template.html similarity index 100% rename from author_template.html rename to src/substack2markdown/assets/author_template.html diff --git a/assets/css/essay-styles.css b/src/substack2markdown/assets/css/essay-styles.css similarity index 100% rename from assets/css/essay-styles.css rename to src/substack2markdown/assets/css/essay-styles.css diff --git a/assets/css/style.css b/src/substack2markdown/assets/css/style.css similarity index 100% rename from assets/css/style.css rename to src/substack2markdown/assets/css/style.css diff --git a/assets/images/screenshot.png b/src/substack2markdown/assets/images/screenshot.png similarity index 100% rename from assets/images/screenshot.png rename to src/substack2markdown/assets/images/screenshot.png diff --git a/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js similarity index 100% rename from assets/js/populate-essays.js rename to src/substack2markdown/assets/js/populate-essays.js diff --git a/substack_scraper.py b/src/substack2markdown/substack_scraper.py similarity index 100% rename from substack_scraper.py rename to src/substack2markdown/substack_scraper.py diff --git a/substack_html_pages/README.md b/substack_html_pages/README.md deleted file mode 100644 index 0931cf8e..00000000 --- a/substack_html_pages/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.html` files for each writer that will enable you -to browse and sort the downloaded markdown files for a given writer. One `.html` file -will be created for each writer. 
\ No newline at end of file From c73855e0a8aa17b916863c895acf8d6e741a2cf3 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 19:05:33 +0100 Subject: [PATCH 02/28] add parameters: config email password --- src/substack2markdown/substack_scraper.py | 42 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 126d260d..734dc553 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -21,7 +21,6 @@ from selenium.common.exceptions import SessionNotCreatedException from selenium.webdriver.chrome.service import Service from urllib.parse import urlparse -from config import EMAIL, PASSWORD USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown @@ -70,9 +69,10 @@ def generate_html_file(author_name: str) -> None: class BaseSubstackScraper(ABC): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" + self.args = args self.base_substack_url: str = base_substack_url self.writer_name: str = extract_main_part(base_substack_url) @@ -371,8 +371,8 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: class SubstackScraper(BaseSubstackScraper): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -392,6 +392,7 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): def __init__( self, + args, base_substack_url: str, md_save_dir: str, html_save_dir: str, @@ -400,7 +401,7 @@ def __init__( edge_driver_path: str = '', user_agent: str = '' ) -> None: - super().__init__(base_substack_url, md_save_dir, html_save_dir) + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) options = EdgeOptions() if headless: @@ -459,8 +460,8 @@ def login(self) -> None: # Email and password email = self.driver.find_element(By.NAME, "email") password = self.driver.find_element(By.NAME, "password") - email.send_keys(EMAIL) - password.send_keys(PASSWORD) + email.send_keys(self.args.email) + password.send_keys(self.args.password) # Find the submit button and click it. submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") @@ -494,6 +495,15 @@ def get_url_soup(self, url: str) -> BeautifulSoup: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") + parser.add_argument( + "--config", type=str, help="JSON config file with email and password." + ) + parser.add_argument( + "--email", type=str, help="Login E-Mail." + ) + parser.add_argument( + "--password", type=str, help="Login password." + ) parser.add_argument( "-u", "--url", type=str, help="The base URL of the Substack site to scrape." 
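        # e.g. --url https://example.substack.com/ (hypothetical publication)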
) @@ -556,17 +566,29 @@ def main(): if args.html_directory is None: args.html_directory = BASE_HTML_DIR + if args.config: + with open(args.config) as f: + config = json.load(f) + args.email = config["email"] + args.password = config["password"] + # TODO more + + assert args.email + assert args.password + if args.url: if args.premium: scraper = PremiumSubstackScraper( - args.url, + args=args, + base_substack_url=args.url, headless=args.headless, md_save_dir=args.directory, html_save_dir=args.html_directory ) else: scraper = SubstackScraper( - args.url, + args=args, + base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) @@ -575,6 +597,7 @@ def main(): else: # Use the hardcoded values at the top of the file if USE_PREMIUM: scraper = PremiumSubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory, @@ -583,6 +606,7 @@ def main(): ) else: scraper = SubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory From 15fc25c71720c2afba4b3bafceabb91b1ed672bb Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 07:55:10 +0100 Subject: [PATCH 03/28] add parameters: assets-dir author-template --- src/substack2markdown/substack_scraper.py | 73 +++++++++++------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 734dc553..297f0937 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -26,6 +26,7 @@ BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts @@ -37,12 +38,12 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(author_name: str) -> None: +def generate_html_file(args, author_name: str) -> None: """ Generates a HTML file for the given author. 
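     Reads data/<author_name>.json and fills the author template,
     writing the result to <html_directory>/<author_name>.html.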
""" - if not os.path.exists(BASE_HTML_DIR): - os.makedirs(BASE_HTML_DIR) + if not os.path.exists(args.html_directory): + os.makedirs(args.html_directory) # Read JSON data json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') @@ -52,7 +53,7 @@ def generate_html_file(author_name: str) -> None: # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file: + with open(args.author_template, 'r', encoding='utf-8') as file: html_template = file.read() # Insert the JSON string into the script tag in the HTML template @@ -63,7 +64,7 @@ def generate_html_file(author_name: str) -> None: html_with_author = html_with_data.replace('author_name', author_name) # Write the modified HTML to a new file - html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html') + html_output_path = os.path.join(args.html_directory, f'{author_name}.html') with open(html_output_path, 'w', encoding='utf-8') as file: file.write(html_with_author) @@ -193,7 +194,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir) + css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" @@ -367,7 +368,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(author_name=self.writer_name) + generate_html_file(self.args, author_name=self.writer_name) class SubstackScraper(BaseSubstackScraper): @@ -505,14 +506,22 @@ def parse_args() -> argparse.Namespace: "--password", type=str, help="Login password." ) parser.add_argument( - "-u", "--url", type=str, help="The base URL of the Substack site to scrape." + "-u", + "--url", # args.url + type=str, + default=BASE_SUBSTACK_URL, + help="The base URL of the Substack site to scrape." ) parser.add_argument( - "-d", "--directory", type=str, help="The directory to save scraped posts." + "-d", + "--directory", # args.directory + type=str, + default=BASE_MD_DIR, + help="The directory to save scraped posts." ) parser.add_argument( "-n", - "--number", + "--number", # args.number type=int, default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", @@ -523,6 +532,15 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include -p in command to use the Premium Substack Scraper with selenium.", ) + parser.add_argument( + "--assets-dir", # args.assets_dir + default=ASSETS_DIR, + help=f"Path to assets directory. Default: {ASSETS_DIR!r}", + ) + parser.add_argument( + "--author-template", # args.author_template + help=f"Path to author_template.html. Default: {repr('{assets_dir}/' + HTML_TEMPLATE)}", + ) parser.add_argument( "--headless", action="store_true", @@ -549,9 +567,10 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", + "--html-directory", # args.html_directory type=str, - help="The directory to save scraped posts as HTML files.", + default=BASE_HTML_DIR, + help=f"The directory to save scraped posts as HTML files. 
Default: {BASE_HTML_DIR!r}", ) return parser.parse_args() @@ -560,12 +579,6 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() - if args.directory is None: - args.directory = BASE_MD_DIR - - if args.html_directory is None: - args.html_directory = BASE_HTML_DIR - if args.config: with open(args.config) as f: config = json.load(f) @@ -576,7 +589,10 @@ def main(): assert args.email assert args.password - if args.url: + if not args.author_template: + args.author_template = args.assets_dir + "/" + HTML_TEMPLATE + + if True: if args.premium: scraper = PremiumSubstackScraper( args=args, @@ -594,25 +610,6 @@ def main(): ) scraper.scrape_posts(args.number) - else: # Use the hardcoded values at the top of the file - if USE_PREMIUM: - scraper = PremiumSubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory, - edge_path=args.edge_path, - edge_driver_path=args.edge_driver_path - ) - else: - scraper = SubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) - scraper.scrape_posts(num_posts_to_scrape=NUM_POSTS_TO_SCRAPE) - if __name__ == "__main__": main() From 8d7676f947cfad94cf32eb013ed12f32c2bc90f8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:05:02 +0100 Subject: [PATCH 04/28] use selenium_driverless --- requirements.txt | 3 +- src/substack2markdown/substack_scraper.py | 205 ++++++++++++---------- 2 files changed, 113 insertions(+), 95 deletions(-) diff --git a/requirements.txt b/requirements.txt index c58926a7..af704d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ bs4==0.0.1 html2text==2020.1.16 requests==2.31.0 -selenium==4.16.0 +selenium-driverless tqdm==4.66.1 -webdriver_manager==4.0.1 Markdown==3.6 diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 297f0937..0f22adc5 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep - +import asyncio +import atexit +import signal import html2text import markdown @@ -14,12 +16,8 @@ from tqdm import tqdm from xml.etree import ElementTree as ET -from selenium import webdriver -from selenium.webdriver.common.by import By -from webdriver_manager.microsoft import EdgeChromiumDriverManager -from selenium.webdriver.edge.options import Options as EdgeOptions -from selenium.common.exceptions import SessionNotCreatedException -from selenium.webdriver.chrome.service import Service +from selenium_driverless import webdriver +from selenium_driverless.types.by import By from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts @@ -70,6 +68,15 @@ def generate_html_file(args, author_name: str) -> None: class BaseSubstackScraper(ABC): + def __await__(self): + return self._async_init().__await__() + + async def __aenter__(self): + return await self + + async def __aexit__(self, exc_type, exc, tb): + pass + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" @@ -92,6 +99,10 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = 
self.get_all_post_urls() + async def _async_init(self): + self._loop = asyncio.get_running_loop() + return self + def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. @@ -326,7 +337,7 @@ def save_essays_data_to_json(self, essays_data: list) -> None: with open(json_path, 'w', encoding='utf-8') as f: json.dump(essays_data, f, ensure_ascii=False, indent=4) - def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: + async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ @@ -340,8 +351,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) - if not os.path.exists(md_filepath): - soup = self.get_url_soup(url) + soup = await self.get_url_soup(url) if soup is None: total += 1 continue @@ -398,100 +408,109 @@ def __init__( md_save_dir: str, html_save_dir: str, headless: bool = False, - edge_path: str = '', - edge_driver_path: str = '', + chromium_path: str = '', user_agent: str = '' ) -> None: super().__init__(args, base_substack_url, md_save_dir, html_save_dir) - options = EdgeOptions() + self.driver = None + + def exit_handler(signum, frame): + print() + print(f"exit_handler: received signal {signum}") + try: + asyncio.get_event_loop().create_task(self._cleanup_sync()) + except Exception: + pass + raise SystemExit(0) + + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + atexit.register(self._cleanup_sync) + + options = webdriver.ChromeOptions() + self.chrome_options = options if headless: - # modern headless flag (works better with recent Edge/Chromium) + # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if edge_path: - options.binary_location = edge_path + if chromium_path: + options.binary_location = chromium_path if user_agent: options.add_argument(f"user-agent={user_agent}") - - if isinstance(options, EdgeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com") - elif isinstance(options, ChromeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://chromedriver.storage.googleapis.com") - - self.driver = None + async def _async_init(self): + self._loop = asyncio.get_running_loop() - # 1) Prefer an explicit driver path (manual download) - if edge_driver_path and os.path.exists(edge_driver_path): - service = Service(executable_path=edge_driver_path) - self.driver = webdriver.Edge(service=service, options=options) - else: - # 2) Try webdriver_manager (needs network/DNS) - try: - service = Service(EdgeChromiumDriverManager().install()) - self.driver = webdriver.Edge(service=service, options=options) - except Exception as e: - print("webdriver_manager could not download msedgedriver (network/DNS). 
Falling back to Selenium Manager.") - # 3) Selenium Manager fallback (still needs network; but avoids webdriver_manager) + await self._start_driver() + await self.login() + return self + + async def _start_driver(self): + self.driver = await webdriver.Chrome(options=self.chrome_options) + + async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self) -> None: + if self.driver: + await self.driver.quit() + + def _cleanup_sync(self): + try: + if not self.driver: + return + proc = self.driver._process + if proc and proc.poll() is None: + proc.terminate() try: - # IMPORTANT: ensure no stale driver in PATH (e.g. C:\Windows\msedgedriver.exe v138) - self.driver = webdriver.Edge(options=options) - except SessionNotCreatedException as se: - raise RuntimeError( - "Selenium Manager fallback failed due to driver/browser mismatch.\n" - "Fix by either: (a) removing stale msedgedriver in PATH (e.g. C:\\Windows\\msedgedriver.exe) and replace with a fresh one downloaded from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver, " - "or (b) pass --edge-driver-path to a manually downloaded driver that matches your Edge version." - ) from se + proc.wait(timeout=1) + except Exception: + proc.kill() + except Exception as exc: + print("_cleanup_sync failed:", exc) + + async def login(self): + await self.driver.get("https://substack.com/sign-in") + await asyncio.sleep(2) + + signin = await self.driver.find_element( + By.XPATH, "//a[contains(@class,'login-option')]" + ) + await signin.click() - self.login() + await asyncio.sleep(2) - def login(self) -> None: - """ - This method logs into Substack using Selenium - """ - self.driver.get("https://substack.com/sign-in") - sleep(3) + email = await self.driver.find_element(By.NAME, "email") + password = await self.driver.find_element(By.NAME, "password") - signin_with_password = self.driver.find_element( - By.XPATH, "//a[@class='login-option substack-login__login-option']" + await email.send_keys(self.args.email) + await password.send_keys(self.args.password) + + submit = await self.driver.find_element( + By.XPATH, "//*[@id='substack-login']//form//button" ) - signin_with_password.click() - sleep(3) - - # Email and password - email = self.driver.find_element(By.NAME, "email") - password = self.driver.find_element(By.NAME, "password") - email.send_keys(self.args.email) - password.send_keys(self.args.password) - - # Find the submit button and click it. - submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") - submit.click() - sleep(30) # Wait for the page to load - - if self.is_login_failed(): - raise Exception( - "Warning: Login unsuccessful. Please check your email and password, or your account status.\n" - "Use the non-premium scraper for the non-paid posts. \n" - "If running headless, run non-headlessly to see if blocked by Captcha." - ) + await submit.click() - def is_login_failed(self) -> bool: + await asyncio.sleep(8) + + if await self.is_login_failed(): + raise RuntimeError("Substack login failed") + + async def is_login_failed(self): """ Check for the presence of the 'error-container' to indicate a failed login attempt. 
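         After a failed submit, Substack renders an element with
         id="error-container"; find_elements returns an empty list
         when it is absent, so the truthiness of that list is the signal.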
""" - error_container = self.driver.find_elements(By.ID, 'error-container') - return len(error_container) > 0 and error_container[0].is_displayed() + elements = await self.driver.find_elements(By.ID, "error-container") + return bool(elements) - def get_url_soup(self, url: str) -> BeautifulSoup: + async def get_url_soup(self, url: str): """ Gets soup from URL using logged in selenium driver """ - try: - self.driver.get(url) - return BeautifulSoup(self.driver.page_source, "html.parser") - except Exception as e: - raise ValueError(f"Error fetching page: {e}") from e + await self.driver.get(url) + html = await self.driver.page_source + return BeautifulSoup(html, "html.parser") def parse_args() -> argparse.Namespace: @@ -548,16 +567,10 @@ def parse_args() -> argparse.Namespace: "Scraper.", ) parser.add_argument( - "--edge-path", + "--chromium-path", # args.chromium_path type=str, default="", - help='Optional: The path to the Edge browser executable (i.e. "path_to_msedge.exe").', - ) - parser.add_argument( - "--edge-driver-path", - type=str, - default="", - help='Optional: The path to the Edge WebDriver executable (i.e. "path_to_msedgedriver.exe").', + help='Optional: The path to the Chromium browser executable (i.e. "path/to/chromium").', ) parser.add_argument( "--user-agent", @@ -576,7 +589,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def main(): +async def async_main(): args = parse_args() if args.config: @@ -594,7 +607,7 @@ def main(): if True: if args.premium: - scraper = PremiumSubstackScraper( + scraper = await PremiumSubstackScraper( args=args, base_substack_url=args.url, headless=args.headless, @@ -602,13 +615,19 @@ def main(): html_save_dir=args.html_directory ) else: - scraper = SubstackScraper( + scraper = await SubstackScraper( args=args, base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) - scraper.scrape_posts(args.number) + + await scraper.scrape_posts(args.number) + await scraper.close() + + +def main(): + asyncio.run(async_main()) if __name__ == "__main__": From 4af8b45d9a477df427a352b5038ea14d983451ce Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:07:52 +0100 Subject: [PATCH 05/28] replace existing files --- src/substack2markdown/substack_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 0f22adc5..91c3f2ac 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -178,7 +178,8 @@ def save_to_file(filepath: str, content: str) -> None: if not isinstance(content, str): raise ValueError("content must be a string") - if os.path.exists(filepath): + # if os.path.exists(filepath): + if False: print(f"File already exists: {filepath}") return @@ -351,6 +352,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) + # if not os.path.exists(md_filepath): + if True: soup = await self.get_url_soup(url) if soup is None: total += 1 From 4b8598af5affd08af4a56e1329f47959f520777c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 11:15:06 +0100 Subject: [PATCH 06/28] fixup: assets-dir --- src/substack2markdown/substack_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 
91c3f2ac..fc3bdabf 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -206,7 +206,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) + css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" From 5811bb5de77a0302e619936afaa7fbe67d45b307 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 12:34:02 +0100 Subject: [PATCH 07/28] download images based on https://github.com/timf34/Substack2Markdown/pull/26 --- src/substack2markdown/substack_scraper.py | 186 +++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index fc3bdabf..e2cc62ae 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -1,6 +1,13 @@ import argparse import json import os +import io +import re +import base64 +import hashlib +import mimetypes +from pathlib import Path +from urllib.parse import urlparse, unquote from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep @@ -18,18 +25,54 @@ from selenium_driverless import webdriver from selenium_driverless.types.by import By -from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +BASE_IMAGE_DIR: str = "substack_images" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts +def count_images_in_markdown(md_content: str) -> int: + """Count number of Substack CDN image URLs in markdown content.""" + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + # regex lookahead: match "...)" but not "...)]" suffix + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)') + matches = re.findall(pattern, md_content) + return len(matches) + + +def sanitize_image_filename(url: str) -> str: + """Create a safe filename from URL or content.""" + # Extract original filename from CDN URL + if "substackcdn.com" in url: + # Get the actual image URL after the CDN parameters + original_url = unquote(url.split("/https%3A%2F%2F")[1]) + filename = original_url.split("/")[-1] + else: + filename = url.split("/")[-1] + + # Remove invalid characters + filename = re.sub(r'[<>:"/\\|?*]', '', filename) + + # If filename is too long or empty, create hash-based name + if len(filename) > 100 or not filename: + hash_object = hashlib.md5(url.encode()) + ext = mimetypes.guess_extension(requests.head(url).headers.get('content-type', '')) or '.jpg' + filename = f"{hash_object.hexdigest()}{ext}" + + return filename + + +def get_post_slug(url: str) -> str: + match = re.search(r'/p/([^/]+)', url) + return 
match.group(1) if match else 'unknown_post' + + def extract_main_part(url: str) -> str: parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.' return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if @@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir os.makedirs(self.html_save_dir) print(f"Created html directory {self.html_save_dir}") + if not self.args.no_images: + os.makedirs(self.args.image_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + + if not self.args.no_images: + total_images = count_images_in_markdown(md) + post_slug = get_post_slug(url) + with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: + md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_essays_data_to_json(essays_data=essays_data) generate_html_file(self.args, author_name=self.writer_name) + async def download_image( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image from URL and save to path.""" + try: + response = requests.get(url, stream=True) + if response.status_code == 200: + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + if pbar: + pbar.update(1) + return str(save_path) + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {str(exc)}") + # raise exc # debug + return None + + async def process_markdown_images( + self, + md_content: str, + author: str, + post_slug: str, + pbar=None + ) -> str: + """Process markdown content to download images and update references.""" + image_dir = Path(self.args.image_directory) / author / post_slug + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + buf = io.StringIO() + last_end = 0 + for match in pattern.finditer(md_content): + buf.write(md_content[last_end:match.start()]) + url = match.group(0).strip("()") + filename = sanitize_image_filename(url) + save_path = image_dir / filename + if not save_path.exists(): + await self.download_image(url, save_path, pbar) + rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + buf.write(f"({rel_path})") + last_end = match.end() + buf.write(md_content[last_end:]) + return buf.getvalue() + class SubstackScraper(BaseSubstackScraper): def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): @@ -515,6 +618,76 @@ async def get_url_soup(self, url: str): html = await self.driver.page_source return BeautifulSoup(html, "html.parser") + async def download_image_FIXME( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image using selenium_driverless""" + + # NOTE for now this works with the default "def download_image" + + # WONTFIX "fetch" fails due to CORS policy 
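+        # the CDP route mentioned below would look roughly like this
+        # (hypothetical sketch, untested; request_id would have to be
+        #  captured from "Network.responseReceived" events first):
+        #   await self.driver.execute_cdp_cmd("Network.enable", {})
+        #   res = await self.driver.execute_cdp_cmd(
+        #       "Network.getResponseBody", {"requestId": request_id})
+        #   data = base64.b64decode(res["body"]) if res["base64Encoded"] else res["body"].encode()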
+ + # WONTFIX "canvas" does not return the original image bytes + + # we could fetch images with CDP Network.getResponseBody + # but that requires lots of boilerplate code + # fix: use https://github.com/milahu/aiohttp_chromium + + try: + # Execute JS fetch inside browser + result = await self.driver.execute_async_script( + """ + const url = arguments[0]; + const callback = arguments[arguments.length - 1]; + + const img = new Image(); + img.crossOrigin = 'Anonymous'; // try to avoid CORS issues + img.onload = () => { + try { + const canvas = document.createElement('canvas'); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0); + const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..." + const base64 = dataUrl.split(',')[1]; // strip prefix + callback({data: base64}); + } catch (err) { + callback({error: err.message, stack: err.stack}); + } + }; + img.onerror = (err) => { + callback({error: 'Image load error', stack: err.toString()}); + }; + img.src = url; + """, + url + ) + + if isinstance(result, dict) and "error" in result: + raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}") + + # Decode base64 to bytes + image_bytes = base64.b64decode(result) + + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, "wb") as f: + f.write(image_bytes) + + if pbar: + pbar.update(1) + + return str(save_path) + + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {exc}") + # raise exc # debug + return None + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") @@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace: default=BASE_HTML_DIR, help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", ) + parser.add_argument( + "--image-directory", # args.image_directory + type=str, + default=BASE_IMAGE_DIR, + help=f"The directory to save scraped image files. 
Default: {BASE_IMAGE_DIR!r}", + ) + parser.add_argument( + "--no-images", # args.no_images + action="store_true", + help=f"Do not download images.", + ) return parser.parse_args() From 153746f5b76d664526db9580259d3be6aa4bcd59 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 18:38:46 +0100 Subject: [PATCH 08/28] download comments fix https://github.com/timf34/Substack2Markdown/issues/3 --- src/substack2markdown/substack_scraper.py | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index e2cc62ae..d2f2ad3a 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -363,6 +363,98 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): + # all comments are stored in javascript + # + # only some comments are rendered in html + # with buttons to "Expand full comment" and "Load More" + # see also + # https://www.selfpublife.com/p/automatically-expand-all-substack-comments + window_preloads = None + for script_element in soup.select("script"): + script_text = script_element.text.strip() + if not script_text.startswith("window._preloads"): + continue + # pos1 = re.search(r'window._preloads\s*=\s*JSON\.parse\(', script_text).span()[1] + pos1 = script_text.find("(") + 1 + pos2 = script_text.rfind(")") + window_preloads = json.loads(json.loads(script_text[pos1:pos2])) + break + assert window_preloads, f"not found at {url!r}" + return window_preloads + + def count_comments(self, comments_preloads): + + def count_comments_inner(comment): + res = 1 + for child_comment in comment["children"]: + res += count_comments_inner(child_comment) + return res + + res = 0 + for comment in comments_preloads["initialComments"]: + res += count_comments_inner(comment) + return res + + def render_comments_html(self, comments_preloads): + + def render_comment_body(body): + body = body.strip() + body = "

" + body + "

" + body = body.replace("\n", "

\n

") + # TODO more? + return body + + def render_comments_html_inner(comment, buf): + assert comment["type"] == "comment", f'unexpected comment type: {comment["type"]!r}' + buf.write(f'

\n') + buf.write(f'\n') + + # NOTE user IDs are constant, user handles are variable + # when i change my user handle + # then other users can use my old user handle + buf.write(f'') + buf.write(comment["name"]) # human-readable username + buf.write('\n') + + other_pub = comment["metadata"].get("author_on_other_pub") + if other_pub: + # NOTE publication handles are quasi-constant: + # when i change my publication handle + # then other users cannot use my old publication handle + # NOTE "Changing your publication's subdomain + # does not automatically set up a redirect from the old subdomain to the new one." + buf.write(f'(') + buf.write(other_pub["name"]) + buf.write(')\n') + + buf.write(comment["date"] + '\n') # "2025-05-17T06:51:39.485Z" + + for reaction, reaction_count in comment["reactions"].items(): + if reaction_count == 0: continue + buf.write(reaction + str(reaction_count) + '\n') # "❤123" + # buf.write(str(reaction_count) + reaction + '\n') # "123❤" + + buf.write('\n') + + buf.write('
\n') + buf.write('\n') + buf.write(render_comment_body(comment["body"]) + '\n') + + for child_comment in comment["children"]: + buf.write('\n') + render_comments_html_inner(child_comment, buf) + buf.write('
\n') + + buf.write('
\n') + buf.write('\n') + + buf = io.StringIO() + # NOTE the name "initial" is misleading. all comments are stored in this array + # NOTE comments are sorted by likes + for comment in comments_preloads["initialComments"]: + render_comments_html_inner(comment, buf) + return buf.getvalue() @abstractmethod def get_url_soup(self, url: str) -> str: @@ -412,6 +504,37 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + comments_html = None + comments_num = None + if not self.args.no_comments: + comments_url = url + "/comments" + # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) + if 0: + # debug + # TODO add option to write the original "preloads" data to json files + with open("comments_preloads.json", "w") as f: + json.dump(comments_preloads, f, indent=2) + raise 5 + comments_num = self.count_comments(comments_preloads) + if comments_num > 0: + comments_html = self.render_comments_html(comments_preloads) + comments_html = ( + '\n\n' + + '
<div>\n' +
+                            # this can collide with other elements with id="comments"
+                            # '<div id="comments">\n' +
+                            '<details open>\n' +
+                            '<summary><h2>Comments</h2></summary>\n' +
+                            '<hr>\n' +
+                            f'{comments_num} comments\n' +
+                            comments_html + '\n' +
+                            '</details>' + '</div>
' + ) + md += comments_html + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -422,6 +545,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "title": title, "subtitle": subtitle, "like_count": like_count, + "comment_count": comments_num, "date": date, "file_link": md_filepath, "html_link": html_filepath @@ -772,6 +896,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download images.", ) + parser.add_argument( + "--no-comments", # args.no_comments + action="store_true", + help=f"Do not download comments.", + ) return parser.parse_args() From 591fa86b49e6f38d802c741e3d8b8f10365c2cf8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:11 +0100 Subject: [PATCH 09/28] handle removed comments --- src/substack2markdown/substack_scraper.py | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d2f2ad3a..27890458 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -413,9 +413,19 @@ def render_comments_html_inner(comment, buf): # NOTE user IDs are constant, user handles are variable # when i change my user handle # then other users can use my old user handle - buf.write(f'') - buf.write(comment["name"]) # human-readable username - buf.write('\n') + if not comment["user_id"] is None: + buf.write(f'') + + if not comment["name"] is None: + buf.write(comment["name"]) # human-readable username + else: + # Comment removed + buf.write("null") + + if not comment["user_id"] is None: + buf.write('\n') + else: + buf.write('\n') other_pub = comment["metadata"].get("author_on_other_pub") if other_pub: @@ -439,7 +449,21 @@ def render_comments_html_inner(comment, buf): buf.write('
\n') buf.write('\n') - buf.write(render_comment_body(comment["body"]) + '\n') + + if comment["body"] is None: + # Comment removed + status = comment.get("status") + if status is None: + buf.write('(Comment removed)\n') + else: + # "moderator_removed", ... + buf.write('(status:' + status + ')\n') + # TODO comment["bans"] + # TODO comment["suppressed"] + # TODO comment["user_banned"] + # TODO comment["user_banned_for_comment"] + else: + buf.write(render_comment_body(comment["body"]) + '\n') for child_comment in comment["children"]: buf.write('\n') From 1458d78ea4828be288c73abdf5462641047813b2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:48 +0100 Subject: [PATCH 10/28] add debug comment --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 27890458..2a1c2727 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -578,6 +578,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: print(f"File already exists: {md_filepath}") except Exception as e: print(f"Error scraping post: {e}") + # raise e # debug count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break From 07e4c1d1581d81d63ff6ac7624da03fa513047d7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 23:26:21 +0100 Subject: [PATCH 11/28] write JSON files --- src/substack2markdown/substack_scraper.py | 35 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 2a1c2727..d31ca8dd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -31,6 +31,7 @@ BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files BASE_IMAGE_DIR: str = "substack_images" +BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" @@ -132,6 +133,8 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.md_save_dir: str = md_save_dir self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.args.json_directory += f"/{self.writer_name}" + if not os.path.exists(md_save_dir): os.makedirs(md_save_dir) print(f"Created md directory {md_save_dir}") @@ -142,6 +145,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir if not self.args.no_images: os.makedirs(self.args.image_directory, exist_ok=True) + if not self.args.no_json: + os.makedirs(self.args.json_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -535,12 +541,11 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) - if 0: - # debug - # TODO add option to write the original "preloads" data to json files - with open("comments_preloads.json", "w") as f: - json.dump(comments_preloads, f, indent=2) - raise 5 + if not 
self.args.no_json: + json_filename = self.get_filename_from_url(url, filetype=".comments.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: comments_html = self.render_comments_html(comments_preloads) @@ -561,6 +566,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) + if not self.args.no_json: + post_preloads = await self.get_window_preloads(soup) + json_filename = self.get_filename_from_url(url, filetype=".post.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) + # Convert markdown to HTML and save html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) @@ -916,6 +928,12 @@ def parse_args() -> argparse.Namespace: default=BASE_IMAGE_DIR, help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", ) + parser.add_argument( + "--json-directory", # args.json_directory + type=str, + default=BASE_JSON_DIR, + help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + ) parser.add_argument( "--no-images", # args.no_images action="store_true", @@ -926,6 +944,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download comments.", ) + parser.add_argument( + "--no-json", # args.no_json + action="store_true", + help=f"Do not write JSON files.", + ) return parser.parse_args() From f17ad508fff59408888cd0f3455a22382a698f5c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Mon, 29 Dec 2025 15:29:46 +0100 Subject: [PATCH 12/28] use output filepath format strings --- src/substack2markdown/substack_scraper.py | 325 +++++++++++++--------- 1 file changed, 190 insertions(+), 135 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d31ca8dd..79a3c9c7 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -14,6 +14,7 @@ import asyncio import atexit import signal +import string import html2text import markdown @@ -34,9 +35,15 @@ BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page -JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts - +DEFAULT_OUTPUT_DIRECTORY_FORMAT = "$publication_domain" +DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename" +DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md" +DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html" +DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html" +DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json" +DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" +DEFAULT_COMMENTS_JSON_PATH_FORMAT = "p/$post_slug/comments.json" def count_images_in_markdown(md_content: str) -> int: """Count number of Substack CDN image URLs in markdown content.""" @@ -80,37 +87,6 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(args, author_name: str) -> None: - """ - Generates a HTML file for the given author. 
- """ - if not os.path.exists(args.html_directory): - os.makedirs(args.html_directory) - - # Read JSON data - json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') - with open(json_path, 'r', encoding='utf-8') as file: - essays_data = json.load(file) - - # Convert JSON data to a JSON string for embedding - embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - - with open(args.author_template, 'r', encoding='utf-8') as file: - html_template = file.read() - - # Insert the JSON string into the script tag in the HTML template - html_with_data = html_template.replace('', author_name).replace( - '', - f'' - ) - html_with_author = html_with_data.replace('author_name', author_name) - - # Write the modified HTML to a new file - html_output_path = os.path.join(args.html_directory, f'{author_name}.html') - with open(html_output_path, 'w', encoding='utf-8') as file: - file.write(html_with_author) - - class BaseSubstackScraper(ABC): def __await__(self): return self._async_init().__await__() @@ -121,32 +97,28 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc, tb): pass - def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): - if not base_substack_url.endswith("/"): - base_substack_url += "/" + def __init__(self, args): self.args = args - self.base_substack_url: str = base_substack_url - - self.writer_name: str = extract_main_part(base_substack_url) - md_save_dir: str = f"{md_save_dir}/{self.writer_name}" + if not self.args.url.endswith("/"): + self.args.url += "/" - self.md_save_dir: str = md_save_dir - self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.publication_handle: str = extract_main_part(self.args.url) - self.args.json_directory += f"/{self.writer_name}" + self.output_directory_template = string.Template(self.args.output_directory_format) - if not os.path.exists(md_save_dir): - os.makedirs(md_save_dir) - print(f"Created md directory {md_save_dir}") - if not os.path.exists(self.html_save_dir): - os.makedirs(self.html_save_dir) - print(f"Created html directory {self.html_save_dir}") + # all these paths are relative to output_directory + self.md_path_template = string.Template(self.args.md_path_format) + self.html_path_template = string.Template(self.args.html_path_format) + self.image_path_template = string.Template(self.args.image_path_format) + self.posts_html_path_template = string.Template(self.args.posts_html_path_format) + self.posts_json_path_template = string.Template(self.args.posts_json_path_format) + self.post_json_path_template = string.Template(self.args.post_json_path_format) + self.comments_json_path_template = string.Template(self.args.comments_json_path_format) - if not self.args.no_images: - os.makedirs(self.args.image_directory, exist_ok=True) - - if not self.args.no_json: - os.makedirs(self.args.json_directory, exist_ok=True) + self.format_vars = { + "publication_handle": self.publication_handle, + "publication_domain": f"{self.publication_handle}.substack.com", + } self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -168,7 +140,7 @@ def fetch_urls_from_sitemap(self) -> List[str]: """ Fetches URLs from sitemap.xml. """ - sitemap_url = f"{self.base_substack_url}sitemap.xml" + sitemap_url = f"{self.args.url}sitemap.xml" response = requests.get(sitemap_url) if not response.ok: @@ -184,7 +156,7 @@ def fetch_urls_from_feed(self) -> List[str]: Fetches URLs from feed.xml. """ print('Falling back to feed.xml. 
This will only contain up to the 22 most recent posts.')
-        feed_url = f"{self.base_substack_url}feed.xml"
+        feed_url = f"{self.args.url}feed.xml"
         response = requests.get(feed_url)
 
         if not response.ok:
@@ -258,7 +230,9 @@ def save_to_html_file(self, filepath: str, content: str) -> None:
 
         # Calculate the relative path from the HTML file to the CSS file
         html_dir = os.path.dirname(filepath)
-        css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir)
+        css_path = self.args.assets_dir + "/css/essay-styles.css"
+        if not os.path.isabs(css_path):
+            css_path = os.path.relpath(css_path, html_dir)
         css_path = css_path.replace("\\", "/")  # Ensure forward slashes for web paths
         html_content = f"""
@@ -490,35 +464,55 @@ def render_comments_html_inner(comment, buf):
     def get_url_soup(self, url: str) -> str:
         raise NotImplementedError
 
-    def save_essays_data_to_json(self, essays_data: list) -> None:
+    def save_posts_data_json(self, posts_data: list) -> None:
         """
         Saves essays data to a JSON file for a specific author.
         """
-        data_dir = os.path.join(JSON_DATA_DIR)
-        if not os.path.exists(data_dir):
-            os.makedirs(data_dir)
-
-        json_path = os.path.join(data_dir, f'{self.writer_name}.json')
-        if os.path.exists(json_path):
-            with open(json_path, 'r', encoding='utf-8') as file:
-                existing_data = json.load(file)
-            essays_data = existing_data + [data for data in essays_data if data not in existing_data]
-        with open(json_path, 'w', encoding='utf-8') as f:
-            json.dump(essays_data, f, ensure_ascii=False, indent=4)
+        posts_json_path = os.path.join(
+            self.format_vars["output_directory"],
+            self.posts_json_path_template.substitute(self.format_vars)
+        )
+        os.makedirs(os.path.dirname(posts_json_path), exist_ok=True)
+        if os.path.exists(posts_json_path):
+            with open(posts_json_path, 'r', encoding='utf-8') as file:
+                existing_data = json.load(file)
+            # remove duplicates from existing_data
+            new_post_ids = set(map(lambda p: p["id"], posts_data))
+            existing_data = [p for p in existing_data if p["id"] not in new_post_ids]
+            posts_data = existing_data + posts_data
+        # sort by post_id, descending
+        posts_data.sort(key=lambda p: -1*p["id"])
+        with open(posts_json_path, 'w', encoding='utf-8') as f:
+            json.dump(posts_data, f, ensure_ascii=False, separators=(',', ':'))
 
     async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
         """
         Iterates over all posts and saves them as markdown and html files
         """
-        essays_data = []
+        posts_data = []
         count = 0
         total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls)
         for url in tqdm(self.post_urls, total=total):
             try:
-                md_filename = self.get_filename_from_url(url, filetype=".md")
-                html_filename = self.get_filename_from_url(url, filetype=".html")
-                md_filepath = os.path.join(self.md_save_dir, md_filename)
-                html_filepath = os.path.join(self.html_save_dir, html_filename)
+                post_slug = url.split("/")[-1]
+                self.format_vars["post_slug"] = post_slug
+
+                output_directory = self.output_directory_template.substitute(self.format_vars)
+                self.format_vars["output_directory"] = output_directory
+
+                md_filepath = os.path.join(
+                    output_directory,
+                    self.md_path_template.substitute(self.format_vars)
+                )
+                self.format_vars["md_filepath"] = md_filepath
+                self.format_vars["md_directory"] = os.path.dirname(md_filepath)
+
+                html_filepath = os.path.join(
+                    output_directory,
+                    self.html_path_template.substitute(self.format_vars)
+                )
+                self.format_vars["html_filepath"] = html_filepath
+                self.format_vars["html_directory"] = os.path.dirname(html_filepath)
 
                 # if not os.path.exists(md_filepath):
                 if True:
@@ 
-527,12 +521,14 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + post_preloads = await self.get_window_preloads(soup) + + post_id = post_preloads["post"]["id"] if not self.args.no_images: total_images = count_images_in_markdown(md) - post_slug = get_post_slug(url) with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: - md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + md = await self.process_markdown_images(md, img_pbar) comments_html = None comments_num = None @@ -542,8 +538,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) if not self.args.no_json: - json_filename = self.get_filename_from_url(url, filetype=".comments.json") - json_filepath = os.path.join(self.args.json_directory, json_filename) + json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) @@ -567,9 +565,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) if not self.args.no_json: - post_preloads = await self.get_window_preloads(soup) - json_filename = self.get_filename_from_url(url, filetype=".post.json") - json_filepath = os.path.join(self.args.json_directory, json_filename) + json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) self.save_to_file(json_filepath, _json) @@ -577,7 +576,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) - essays_data.append({ + posts_data.append({ + "id": post_id, "title": title, "subtitle": subtitle, "like_count": like_count, @@ -594,8 +594,51 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break - self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(self.args, author_name=self.writer_name) + self.save_posts_data_json(posts_data) + self.generate_main_html_file() + + def generate_main_html_file(self) -> None: + """ + Generates a HTML file for the given author. 
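+        Reads the accumulated posts JSON file, embeds it into the
+        author template, and writes the result to the posts HTML path.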
+ """ + # Read JSON data + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + + # Convert JSON data to a JSON string for embedding + embedded_json_data = json.dumps(posts_data, ensure_ascii=False, separators=(',', ':')) + + html_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_html_path_template.substitute(self.format_vars) + ) + + with open(self.args.author_template, 'r', encoding='utf-8') as file: + html_template = file.read() + + html_with_data = html_template + + # patch assets path + assets_path = self.args.assets_dir + if not os.path.isabs(assets_path): + assets_path = os.path.relpath(assets_path, os.path.dirname(html_output_path)) + html_with_data = html_with_data.replace('"../assets', f'"{assets_path}') + + html_with_data = html_with_data.replace('', self.publication_handle) + + # Insert the JSON string into the script tag in the HTML template + html_with_data = html_with_data.replace( + '', + f'' + ) + + # Write the modified HTML to a new file + with open(html_output_path, 'w', encoding='utf-8') as file: + file.write(html_with_data) async def download_image( self, @@ -624,12 +667,10 @@ async def download_image( async def process_markdown_images( self, md_content: str, - author: str, - post_slug: str, pbar=None ) -> str: """Process markdown content to download images and update references.""" - image_dir = Path(self.args.image_directory) / author / post_slug + output_directory = self.format_vars["output_directory"] # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') buf = io.StringIO() @@ -638,10 +679,20 @@ async def process_markdown_images( buf.write(md_content[last_end:match.start()]) url = match.group(0).strip("()") filename = sanitize_image_filename(url) - save_path = image_dir / filename + format_vars = { + **self.format_vars, + "image_filename": filename, + } + save_path = Path(os.path.join( + output_directory, + self.image_path_template.substitute(format_vars) + )) if not save_path.exists(): await self.download_image(url, save_path, pbar) - rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + md_directory = self.format_vars["md_directory"] + rel_path = save_path + if not os.path.isabs(rel_path): + rel_path = os.path.relpath(save_path, md_directory) buf.write(f"({rel_path})") last_end = match.end() buf.write(md_content[last_end:]) @@ -649,8 +700,8 @@ async def process_markdown_images( class SubstackScraper(BaseSubstackScraper): - def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(args, base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args): + super().__init__(args, self.args.url) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -668,17 +719,8 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): - def __init__( - self, - args, - base_substack_url: str, - md_save_dir: str, - html_save_dir: str, - headless: bool = False, - chromium_path: str = '', - user_agent: str = '' - ) -> None: - super().__init__(args, base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args) -> None: + super().__init__(args) self.driver = None @@ -698,13 +740,13 @@ def exit_handler(signum, 
frame): options = webdriver.ChromeOptions() self.chrome_options = options - if headless: + if self.args.headless: # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if chromium_path: - options.binary_location = chromium_path - if user_agent: - options.add_argument(f"user-agent={user_agent}") + if self.args.chromium_path: + options.binary_location = self.args.chromium_path + if self.args.user_agent: + options.add_argument(f"user-agent={self.args.user_agent}") async def _async_init(self): self._loop = asyncio.get_running_loop() @@ -868,13 +910,6 @@ def parse_args() -> argparse.Namespace: default=BASE_SUBSTACK_URL, help="The base URL of the Substack site to scrape." ) - parser.add_argument( - "-d", - "--directory", # args.directory - type=str, - default=BASE_MD_DIR, - help="The directory to save scraped posts." - ) parser.add_argument( "-n", "--number", # args.number @@ -917,22 +952,53 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", # args.html_directory + "--output-directory-format", # args.output_directory_format + type=str, + default=DEFAULT_OUTPUT_DIRECTORY_FORMAT, + # all relative output file paths are relative to this directory + help=f"The file path format of the directory to save output files. Default: {DEFAULT_OUTPUT_DIRECTORY_FORMAT!r}", + ) + parser.add_argument( + "--md-path-format", # args.md_path_format + type=str, + default=DEFAULT_MD_PATH_FORMAT, + help=f"The file path format to save scraped posts as Markdown files. Default: {DEFAULT_MD_PATH_FORMAT!r}", + ) + parser.add_argument( + "--html-path-format", # args.html_path_format + type=str, + default=DEFAULT_HTML_PATH_FORMAT, + help=f"The file path format to save scraped posts as HTML files. Default: {DEFAULT_HTML_PATH_FORMAT!r}", + ) + parser.add_argument( + "--image-path-format", # args.image_path_format + type=str, + default=DEFAULT_IMAGE_PATH_FORMAT, + help=f"The file path format to save scraped image files. Default: {DEFAULT_IMAGE_PATH_FORMAT!r}", + ) + parser.add_argument( + "--posts-html-path-format", # args.posts_html_path_format + type=str, + default=DEFAULT_POSTS_HTML_PATH_FORMAT, + help=f"The file path format to save an index of scraped posts as HTML file. Default: {DEFAULT_POSTS_HTML_PATH_FORMAT!r}", + ) + parser.add_argument( + "--posts-json-path-format", # args.posts_json_path_format type=str, - default=BASE_HTML_DIR, - help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", + default=DEFAULT_POSTS_JSON_PATH_FORMAT, + help=f"The file path format to save metadata of scraped posts as JSON file. Default: {DEFAULT_POSTS_JSON_PATH_FORMAT!r}", ) parser.add_argument( - "--image-directory", # args.image_directory + "--post-json-path-format", # args.post_json_path_format type=str, - default=BASE_IMAGE_DIR, - help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", + default=DEFAULT_POST_JSON_PATH_FORMAT, + help=f"The file path format to save scraped posts as JSON files. Default: {DEFAULT_POST_JSON_PATH_FORMAT!r}", ) parser.add_argument( - "--json-directory", # args.json_directory + "--comments-json-path-format", # args.comments_json_path_format type=str, - default=BASE_JSON_DIR, - help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + default=DEFAULT_COMMENTS_JSON_PATH_FORMAT, + help=f"The file path format to save scraped comments as JSON files. 
Default: {DEFAULT_COMMENTS_JSON_PATH_FORMAT!r}", ) parser.add_argument( "--no-images", # args.no_images @@ -971,20 +1037,9 @@ async def async_main(): if True: if args.premium: - scraper = await PremiumSubstackScraper( - args=args, - base_substack_url=args.url, - headless=args.headless, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) + scraper = await PremiumSubstackScraper(args) else: - scraper = await SubstackScraper( - args=args, - base_substack_url=args.url, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) + scraper = await SubstackScraper(args) await scraper.scrape_posts(args.number) await scraper.close() From 38875f844e689c5f865c862c3829bdcae94c0e79 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Mon, 29 Dec 2025 16:29:45 +0100 Subject: [PATCH 13/28] add json_dump_kwargs --- src/substack2markdown/substack_scraper.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 79a3c9c7..dd151b9e 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -45,6 +45,12 @@ DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" DEFAULT_COMMENTS_JSON_PATH_FORMAT = "p/$post_slug/comments.json" +json_dump_kwargs = dict( + ensure_ascii=False, + indent=0, + separators=(',', ':'), +) + def count_images_in_markdown(md_content: str) -> int: """Count number of Substack CDN image URLs in markdown content.""" # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) @@ -483,7 +489,7 @@ def save_posts_data_json(self, posts_data: list) -> None: # sort by post_id, descending posts_data.sort(key=lambda p: -1*p["id"]) with open(posts_json_path, 'w', encoding='utf-8') as f: - json.dump(posts_data, f, ensure_ascii=False, separators=(',', ':')) + json.dump(posts_data, f, **json_dump_kwargs) async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ @@ -542,7 +548,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: output_directory, self.comments_json_path_template.substitute(self.format_vars) ) - _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + _json = json.dumps(comments_preloads, **json_dump_kwargs) self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: @@ -569,7 +575,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: output_directory, self.post_json_path_template.substitute(self.format_vars) ) - _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + _json = json.dumps(post_preloads, **json_dump_kwargs) self.save_to_file(json_filepath, _json) # Convert markdown to HTML and save @@ -610,7 +616,7 @@ def generate_main_html_file(self) -> None: posts_data = json.load(file) # Convert JSON data to a JSON string for embedding - embedded_json_data = json.dumps(posts_data, ensure_ascii=False, separators=(',', ':')) + embedded_json_data = json.dumps(posts_data, **json_dump_kwargs) html_output_path = os.path.join( self.format_vars["output_directory"], From 2ef534c6961c1be88df4f8d8bd14df1b467593f9 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:08:39 +0100 Subject: [PATCH 14/28] fix class SubstackScraper --- src/substack2markdown/substack_scraper.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py 
b/src/substack2markdown/substack_scraper.py index dd151b9e..835ca5bd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -101,6 +101,9 @@ async def __aenter__(self): return await self async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self): pass def __init__(self, args): @@ -706,10 +709,7 @@ async def process_markdown_images( class SubstackScraper(BaseSubstackScraper): - def __init__(self, args): - super().__init__(args, self.args.url) - - def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: + async def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ Gets soup from URL using requests """ @@ -764,9 +764,6 @@ async def _async_init(self): async def _start_driver(self): self.driver = await webdriver.Chrome(options=self.chrome_options) - async def __aexit__(self, exc_type, exc, tb): - await self.close() - async def close(self) -> None: if self.driver: await self.driver.quit() From ad84f46ddc3a9defc82e4a2d5f29b8c84534efb2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:09:07 +0100 Subject: [PATCH 15/28] add parameter offline --- src/substack2markdown/substack_scraper.py | 81 +++++++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 835ca5bd..45cbea05 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -140,11 +140,32 @@ def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. """ + if self.args.offline: + return self.get_all_post_urls_offline() urls = self.fetch_urls_from_sitemap() if not urls: urls = self.fetch_urls_from_feed() return self.filter_urls(urls, self.keywords) + def get_all_post_urls_offline(self) -> List[str]: + # Read JSON data + # NOTE this assumes that $post_slug is not used in args.output_directory_format + # because post_slug is undefined at this point + output_directory = self.output_directory_template.substitute(self.format_vars) + self.format_vars["output_directory"] = output_directory + posts_json_path = os.path.join( + # self.format_vars["output_directory"] = + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + urls = [] + for post in posts_data: + post["slug"] = post["html_link"].split("/")[-2] # FIXME remove + urls.append(self.args.url + "p/" + post["slug"]) + return urls + def fetch_urls_from_sitemap(self) -> List[str]: """ Fetches URLs from sitemap.xml. 
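The offline branch above rebuilds the post URL list from a previously saved posts.json instead of fetching sitemap.xml or feed.xml. A minimal standalone sketch of that recovery logic, assuming html_link values shaped like the default "p/$post_slug/index.html"; the sample data and base URL below are hypothetical:

import json

# Hypothetical posts.json content, as written by save_posts_data_json.
posts_data = json.loads('[{"id": 2, "html_link": "p/second-post/index.html"}, {"id": 1, "html_link": "p/first-post/index.html"}]')

base_url = "https://example.substack.com/"  # hypothetical publication URL
urls = []
for post in posts_data:
    # html_link ends in "p/<post_slug>/index.html", so the slug is the second-to-last path segment
    slug = post["html_link"].split("/")[-2]
    urls.append(base_url + "p/" + slug)

print(urls)
# ['https://example.substack.com/p/second-post', 'https://example.substack.com/p/first-post']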
@@ -352,6 +373,27 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + def extract_post_data_from_preloads(self, post_preloads): + + title = post_preloads["post"]["title"] + + subtitle = post_preloads["post"]["description"] + + like_count = post_preloads["post"]["reactions"]["❤"] + + # TODO expose date format + datetime_format = "%b %d, %Y" # "Oct 01, 2025" + + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" + date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) + + content_html = post_preloads["post"]["body_html"] + md = self.html_to_md(content_html) + # Combine metadata + content + md_content = self.combine_metadata_and_content(title, subtitle, date, like_count, md) + + return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): # all comments are stored in javascript # @@ -470,7 +512,7 @@ def render_comments_html_inner(comment, buf): return buf.getvalue() @abstractmethod - def get_url_soup(self, url: str) -> str: + async def get_url_soup(self, url: str) -> str: raise NotImplementedError def save_posts_data_json(self, posts_data: list) -> None: @@ -524,7 +566,15 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.format_vars["html_directory"] = os.path.dirname(html_filepath) # if not os.path.exists(md_filepath): - if True: + if self.args.offline: + json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) + with open(json_filepath) as f: + post_preloads = json.load(f) + title, subtitle, like_count, date, md = self.extract_post_data_from_preloads(post_preloads) + else: soup = await self.get_url_soup(url) if soup is None: total += 1 @@ -532,20 +582,31 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) + if True: post_id = post_preloads["post"]["id"] + if True: if not self.args.no_images: total_images = count_images_in_markdown(md) with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, img_pbar) + if True: comments_html = None comments_num = None if not self.args.no_comments: comments_url = url + "/comments" # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test - comments_soup = await self.get_url_soup(comments_url) - comments_preloads = await self.get_window_preloads(comments_soup) + if self.args.offline: + json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) + with open(json_filepath) as f: + comments_preloads = json.load(f) + else: + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) if not self.args.no_json: json_filepath = os.path.join( output_directory, @@ -587,6 +648,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: posts_data.append({ "id": post_id, + "slug": post_preloads["post"]["slug"], "title": title, "subtitle": subtitle, "like_count": like_count, @@ -696,7 +758,7 @@ async def process_markdown_images( output_directory, self.image_path_template.substitute(format_vars) )) - if not save_path.exists(): + if not save_path.exists() and not self.args.offline: await self.download_image(url, save_path, 
pbar) md_directory = self.format_vars["md_directory"] rel_path = save_path @@ -920,6 +982,11 @@ def parse_args() -> argparse.Namespace: default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", ) + parser.add_argument( + "--offline", # args.offline + action="store_true", + help="Use existing JSON files to render Markdown and HTML files.", + ) parser.add_argument( "-p", "--premium", @@ -1039,7 +1106,9 @@ async def async_main(): args.author_template = args.assets_dir + "/" + HTML_TEMPLATE if True: - if args.premium: + if args.offline: + scraper = await SubstackScraper(args) + elif args.premium: scraper = await PremiumSubstackScraper(args) else: scraper = await SubstackScraper(args) From 94192ca5dbc6131b1ff8f258c5c5e6b8c9cb7c6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:22:14 +0100 Subject: [PATCH 16/28] fix paths in posts.json --- src/substack2markdown/substack_scraper.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 45cbea05..341e4cb6 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -149,8 +149,6 @@ def get_all_post_urls(self) -> List[str]: def get_all_post_urls_offline(self) -> List[str]: # Read JSON data - # NOTE this assumes that $post_slug is not used in args.output_directory_format - # because post_slug is undefined at this point output_directory = self.output_directory_template.substitute(self.format_vars) self.format_vars["output_directory"] = output_directory posts_json_path = os.path.join( @@ -540,6 +538,15 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ + output_directory = self.output_directory_template.substitute(self.format_vars) + self.format_vars["output_directory"] = output_directory + + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + posts_json_dir = os.path.dirname(posts_json_path) + posts_data = [] count = 0 total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls) @@ -548,9 +555,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: post_slug = url.split("/")[-1] self.format_vars["post_slug"] = post_slug - output_directory = self.output_directory_template.substitute(self.format_vars) - self.format_vars["output_directory"] = output_directory - md_filepath = os.path.join( output_directory, self.md_path_template.substitute(self.format_vars) @@ -654,8 +658,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "like_count": like_count, "comment_count": comments_num, "date": date, - "file_link": md_filepath, - "html_link": html_filepath + "file_link": os.path.relpath(md_filepath, posts_json_dir), + "html_link": os.path.relpath(html_filepath, posts_json_dir), }) else: print(f"File already exists: {md_filepath}") From 6cccf75c955387be805f69324100d2e8a947cedd Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:02:48 +0100 Subject: [PATCH 17/28] add repost_count to posts_data --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 341e4cb6..7f3fa9ca 100644 --- a/src/substack2markdown/substack_scraper.py +++ 
b/src/substack2markdown/substack_scraper.py @@ -657,6 +657,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "subtitle": subtitle, "like_count": like_count, "comment_count": comments_num, + "repost_count": post_preloads["post"]["restacks"], "date": date, "file_link": os.path.relpath(md_filepath, posts_json_dir), "html_link": os.path.relpath(html_filepath, posts_json_dir), From d311b56756c102b0534eb8e94a3bae5b943321c6 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:19:25 +0100 Subject: [PATCH 18/28] add post_json to posts_data --- src/substack2markdown/substack_scraper.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 7f3fa9ca..c23e5a75 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -569,6 +569,18 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.format_vars["html_filepath"] = html_filepath self.format_vars["html_directory"] = os.path.dirname(html_filepath) + post_json_filepath = None + comments_json_filepath = None + if not self.args.no_json: + post_json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) + comments_json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) + # if not os.path.exists(md_filepath): if self.args.offline: json_filepath = os.path.join( @@ -650,7 +662,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) - posts_data.append({ + post = { "id": post_id, "slug": post_preloads["post"]["slug"], "title": title, @@ -661,7 +673,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "date": date, "file_link": os.path.relpath(md_filepath, posts_json_dir), "html_link": os.path.relpath(html_filepath, posts_json_dir), - }) + } + + if not self.args.no_json: + post["post_json"] = os.path.relpath(post_json_filepath, posts_json_dir) + post["comments_json"] = os.path.relpath(comments_json_filepath, posts_json_dir) + + posts_data.append(post) else: print(f"File already exists: {md_filepath}") except Exception as e: From d7ff58243ef64e41982f96b7fc2c20d0672ff9ca Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:36:30 +0100 Subject: [PATCH 19/28] add generate_main_md_file --- src/substack2markdown/substack_scraper.py | 82 +++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index c23e5a75..64fed313 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -40,6 +40,7 @@ DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename" DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md" DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html" +DEFAULT_POSTS_MD_PATH_FORMAT = "readme.md" DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html" DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json" DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" @@ -119,6 +120,7 @@ def __init__(self, args): self.md_path_template = string.Template(self.args.md_path_format) self.html_path_template = string.Template(self.args.html_path_format) self.image_path_template = string.Template(self.args.image_path_format) + self.posts_md_path_template = 
string.Template(self.args.posts_md_path_format) self.posts_html_path_template = string.Template(self.args.posts_html_path_format) self.posts_json_path_template = string.Template(self.args.posts_json_path_format) self.post_json_path_template = string.Template(self.args.post_json_path_format) @@ -689,8 +691,77 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_posts_data_json(posts_data) + self.generate_main_md_file() self.generate_main_html_file() + def generate_main_md_file(self) -> None: + """ + Generates a Markdown file for the given author. + """ + # Read JSON data + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + + # sort by post_id, descending + posts_data.sort(key=lambda p: -1*p["id"]) + + last_post = posts_data[0] + last_post_json_path = last_post["post_json"] + last_post_json_path = os.path.join( + os.path.dirname(posts_json_path), + last_post_json_path + ) + + with open(last_post_json_path, 'r', encoding='utf-8') as file: + last_post = json.load(file) + + publication = last_post["pub"] + + md_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_md_path_template.substitute(self.format_vars) + ) + + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(f'# {publication["name"]}\n') + file.write('\n') + # author_url = f'https://substack.com/@{publication["author_handle"]}' # variable + author_url = f'https://substack.com/profile/{publication["author_id"]}' # constant + file.write(f'by [{publication["author_name"]}]({author_url})\n') + file.write('\n') + author_bio = publication["author_bio"].replace("\n", "\n\n") + file.write(f'{author_bio}\n') + file.write('\n') + file.write('\n') + file.write('\n') + file.write('## Posts\n') + file.write('\n') + for post in posts_data: + # TODO use args.datetime_format + post_date = post["date"] + post_link = ( + '<a href="' + post["file_link"] + '">' + + post["title"].replace('<', '&lt;') + + '</a>' + ) + if post["like_count"] > 0: + post_link += f" ❤" + str(post["like_count"]) # "❤123" + if post["comment_count"] > 0: + post_link += f" 🗨" + str(post["comment_count"]) # "🗨123" + if post["repost_count"] > 0: + post_link += f" ↻" + str(post["repost_count"]) # "↻123" + file.write(f'- {post_date} - {post_link}\n') def generate_main_html_file(self) -> None: """ Generates a HTML file for the given author. @@ -706,6 +777,11 @@ def generate_main_html_file(self) -> None: posts_data = json.load(file) # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(posts_data, **json_dump_kwargs) + md_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_md_path_template.substitute(self.format_vars) + ) + html_output_path = os.path.join( self.format_vars["output_directory"], self.posts_html_path_template.substitute(self.format_vars) @@ -1069,6 +1145,12 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_IMAGE_PATH_FORMAT, help=f"The file path format to save scraped image files. Default: {DEFAULT_IMAGE_PATH_FORMAT!r}", ) + parser.add_argument( + "--posts-md-path-format", # args.posts_md_path_format + type=str, + default=DEFAULT_POSTS_MD_PATH_FORMAT, + help=f"The file path format to save an index of scraped posts as Markdown file.
Default: {DEFAULT_POSTS_MD_PATH_FORMAT!r}", + ) parser.add_argument( "--posts-html-path-format", # args.posts_html_path_format type=str, From cbf9cbefab76db8329ef66f6af0d48d683049bcd Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:38:24 +0100 Subject: [PATCH 20/28] fix post links: remove ../ prefix --- src/substack2markdown/assets/js/populate-essays.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js index 0c700383..4bca7708 100644 --- a/src/substack2markdown/assets/js/populate-essays.js +++ b/src/substack2markdown/assets/js/populate-essays.js @@ -19,7 +19,7 @@ function populateEssays(data) { const essaysContainer = document.getElementById('essays-container'); const list = data.map(essay => `
-                <a href="../${essay.html_link}" target="_blank">${essay.title}</a> +                <a href="${essay.html_link}" target="_blank">${essay.title}</a>
    ${essay.subtitle}
From 8d24a803978309e025d52a814a3e7ef2b88f2718 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:40:44 +0100 Subject: [PATCH 21/28] fix post links: remove target="_blank" --- src/substack2markdown/assets/js/populate-essays.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js index 4bca7708..4c2249e1 100644 --- a/src/substack2markdown/assets/js/populate-essays.js +++ b/src/substack2markdown/assets/js/populate-essays.js @@ -19,7 +19,7 @@ function populateEssays(data) { const essaysContainer = document.getElementById('essays-container'); const list = data.map(essay => `
-                <a href="${essay.html_link}" target="_blank">${essay.title}</a> +                <a href="${essay.html_link}">${essay.title}</a>
    ${essay.subtitle}
From c138b87bcb6ee8b440a2c6ef8ca97bb3fd5b3b02 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 15:34:16 +0100 Subject: [PATCH 22/28] store ISO format date in posts.json --- src/substack2markdown/substack_scraper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 64fed313..18f6a79c 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -381,11 +381,10 @@ def extract_post_data_from_preloads(self, post_preloads): like_count = post_preloads["post"]["reactions"]["❤"] - # TODO expose date format - datetime_format = "%b %d, %Y" # "Oct 01, 2025" - date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" - date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" + + # datetime_format = "%b %d, %Y" # "Oct 01, 2025" + # date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) content_html = post_preloads["post"]["body_html"] md = self.html_to_md(content_html) @@ -599,6 +598,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: continue title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" if True: post_id = post_preloads["post"]["id"] From 2acf823b707c49816d433a0159a06fc418e79a47 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 15:55:09 +0100 Subject: [PATCH 23/28] fix regex pattern in process_markdown_images --- src/substack2markdown/substack_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 18f6a79c..7954ddc6 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -842,12 +842,12 @@ async def process_markdown_images( """Process markdown content to download images and update references.""" output_directory = self.format_vars["output_directory"] # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) - pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + pattern = re.compile(r'\((https://substackcdn\.com/image/fetch/[^\s\)]+)\)') buf = io.StringIO() last_end = 0 for match in pattern.finditer(md_content): buf.write(md_content[last_end:match.start()]) - url = match.group(0).strip("()") + url = match.group(1) filename = sanitize_image_filename(url) format_vars = { **self.format_vars, From e5b0bdd738dcf88fe6a8598240dc1e92e55eefd7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 18:00:23 +0100 Subject: [PATCH 24/28] download images: add resolve_image_url --- src/substack2markdown/substack_scraper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 7954ddc6..9e129131 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -83,6 +83,15 @@ def sanitize_image_filename(url: str) -> str: return filename +def resolve_image_url(url: str) -> str: + """Get the original image URL.""" + # https://substackcdn.com/image/fetch/xxx/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fxxx + if
url.startswith("https://substackcdn.com/image/fetch/"): + # substackcdn.com returns a compressed version of the original image + url = "https://" + unquote(url.split("/https%3A%2F%2F")[1]) + return url + + def get_post_slug(url: str) -> str: match = re.search(r'/p/([^/]+)', url) return match.group(1) if match else 'unknown_post' @@ -848,6 +857,7 @@ async def process_markdown_images( for match in pattern.finditer(md_content): buf.write(md_content[last_end:match.start()]) url = match.group(1) + url = resolve_image_url(url) filename = sanitize_image_filename(url) format_vars = { **self.format_vars, From 0997609d1e67cefd1e9daf11014ced9ff07ca9bf Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 18:09:40 +0100 Subject: [PATCH 25/28] fix type of like_count --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 9e129131..64a201d1 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -371,6 +371,7 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st if like_count_element and like_count_element.text.strip().isdigit() else "0" ) + like_count = int(like_count) # Post content content_element = soup.select_one("div.available-content") From 4ecb5e25f7a8e6fccb59dcec356d1a7ade27de33 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 17:48:43 +0100 Subject: [PATCH 26/28] fix url loop in scrape_posts --- src/substack2markdown/substack_scraper.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 64a201d1..adafad65 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -559,9 +559,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: posts_json_dir = os.path.dirname(posts_json_path) posts_data = [] - count = 0 - total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls) - for url in tqdm(self.post_urls, total=total): + post_urls_slice = self.post_urls if num_posts_to_scrape == 0 else self.post_urls[:num_posts_to_scrape] + for url in tqdm(post_urls_slice): try: post_slug = url.split("/")[-1] self.format_vars["post_slug"] = post_slug @@ -604,7 +603,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: else: soup = await self.get_url_soup(url) if soup is None: - total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) @@ -697,9 +695,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: except Exception as e: print(f"Error scraping post: {e}") # raise e # debug - count += 1 - if num_posts_to_scrape != 0 and count == num_posts_to_scrape: - break self.save_posts_data_json(posts_data) self.generate_main_md_file() self.generate_main_html_file() From e5b0f177172dca4bb1cca4f3689cdafbb3436840 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 18:09:53 +0100 Subject: [PATCH 27/28] remove parameter offline --- src/substack2markdown/substack_scraper.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index adafad65..55f5299f 100644 --- a/src/substack2markdown/substack_scraper.py +++ 
b/src/substack2markdown/substack_scraper.py @@ -670,6 +670,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # Convert markdown to HTML and save html_content = self.md_to_html(md) + # if self.args.offline: + # html_content = post_preloads["post"]["body_html"] + # else: + # html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) post = { @@ -1087,11 +1091,16 @@ def parse_args() -> argparse.Namespace: default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", ) - parser.add_argument( - "--offline", # args.offline - action="store_true", - help="Use existing JSON files to render Markdown and HTML files.", - ) + # this was based on the wrong assumption + # that post_preloads JSON data contains the same body_html as the HTML page, but + # post_preloads["post"]["body_html"] contains HTML components with "data-attrs" attributes + # str(soup.select_one("div.available-content")) is clean HTML + # TODO convert HTML components to clean HTML + # parser.add_argument( + # "--offline", # args.offline + # action="store_true", + # help="Use existing JSON files to render Markdown and HTML files.", + # ) parser.add_argument( "-p", "--premium", @@ -1203,6 +1212,8 @@ def parse_args() -> argparse.Namespace: async def async_main(): args = parse_args() + args.offline = False + if args.config: with open(args.config) as f: config = json.load(f) From 55b7e915d2f8c1e1262b692e334c2149eaad739d Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 18:28:49 +0100 Subject: [PATCH 28/28] add process_markdown_links --- src/substack2markdown/substack_scraper.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 55f5299f..0bc32838 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -617,6 +617,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, img_pbar) + md = self.process_markdown_links(md) + if True: comments_html = None comments_num = None @@ -878,6 +880,25 @@ async def process_markdown_images( buf.write(md_content[last_end:]) return buf.getvalue() + def process_markdown_links(self, md_content): + # patch links to other posts of this publication + pattern = re.compile(r'\]\(https://' + self.publication_handle + r'\.substack\.com/p/([^\s\)]+)\)') + md_directory = self.format_vars["md_directory"] + output_directory = self.format_vars["output_directory"] + def get_replacement(match): + post_slug = match.group(1) + md_filepath = os.path.join( + output_directory, + self.md_path_template.substitute({ + **self.format_vars, + "post_slug": post_slug, + }) + ) + md_filepath_rel = os.path.relpath(md_filepath, md_directory) + return '](' + md_filepath_rel + ')' + md_content = re.sub(pattern, get_replacement, md_content) + return md_content + class SubstackScraper(BaseSubstackScraper): async def get_url_soup(self, url: str) -> Optional[BeautifulSoup]:
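A minimal standalone sketch of the link rewriting added in this last patch, assuming the default md path format "p/$post_slug/readme.md"; the publication handle, directories, and sample markdown below are hypothetical (re.escape is added here as a small hardening; the patch concatenates the handle into the pattern as-is):

import os
import re
import string

publication_handle = "example"  # hypothetical publication handle
md_path_template = string.Template("p/$post_slug/readme.md")
output_directory = "output/example"  # hypothetical output root
md_directory = "output/example/p/current-post"  # directory of the markdown file being rewritten

md_content = "see [part one](https://example.substack.com/p/part-one) for details"

# Turn absolute links to this publication's posts into relative links
# between the local markdown files.
pattern = re.compile(r'\]\(https://' + re.escape(publication_handle) + r'\.substack\.com/p/([^\s\)]+)\)')

def get_replacement(match):
    post_slug = match.group(1)
    md_filepath = os.path.join(output_directory, md_path_template.substitute(post_slug=post_slug))
    return '](' + os.path.relpath(md_filepath, md_directory) + ')'

print(pattern.sub(get_replacement, md_content))
# see [part one](../part-one/readme.md) for details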