From c520fb3a8d64a6dde29d6f8d269213f9a14fbd6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 18:39:38 +0100 Subject: [PATCH 01/28] add setup.py --- config.py | 2 -- data/README.md | 3 --- src/substack2markdown/__init__.py | 0 .../substack2markdown/assets/author_template.html | 0 .../substack2markdown/assets}/css/essay-styles.css | 0 .../substack2markdown/assets}/css/style.css | 0 .../substack2markdown/assets}/images/screenshot.png | Bin .../substack2markdown/assets}/js/populate-essays.js | 0 .../substack2markdown/substack_scraper.py | 0 substack_html_pages/README.md | 3 --- 10 files changed, 8 deletions(-) delete mode 100644 config.py delete mode 100644 data/README.md create mode 100644 src/substack2markdown/__init__.py rename author_template.html => src/substack2markdown/assets/author_template.html (100%) rename {assets => src/substack2markdown/assets}/css/essay-styles.css (100%) rename {assets => src/substack2markdown/assets}/css/style.css (100%) rename {assets => src/substack2markdown/assets}/images/screenshot.png (100%) rename {assets => src/substack2markdown/assets}/js/populate-essays.js (100%) rename substack_scraper.py => src/substack2markdown/substack_scraper.py (100%) delete mode 100644 substack_html_pages/README.md diff --git a/config.py b/config.py deleted file mode 100644 index 8fc6bff2..00000000 --- a/config.py +++ /dev/null @@ -1,2 +0,0 @@ -EMAIL = "your-email@domain.com" -PASSWORD = "your-password" diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 27476ca6..00000000 --- a/data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.json` files for each writer -containing metadata that is used to populate a `.html` file for that -author. \ No newline at end of file diff --git a/src/substack2markdown/__init__.py b/src/substack2markdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/author_template.html b/src/substack2markdown/assets/author_template.html similarity index 100% rename from author_template.html rename to src/substack2markdown/assets/author_template.html diff --git a/assets/css/essay-styles.css b/src/substack2markdown/assets/css/essay-styles.css similarity index 100% rename from assets/css/essay-styles.css rename to src/substack2markdown/assets/css/essay-styles.css diff --git a/assets/css/style.css b/src/substack2markdown/assets/css/style.css similarity index 100% rename from assets/css/style.css rename to src/substack2markdown/assets/css/style.css diff --git a/assets/images/screenshot.png b/src/substack2markdown/assets/images/screenshot.png similarity index 100% rename from assets/images/screenshot.png rename to src/substack2markdown/assets/images/screenshot.png diff --git a/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js similarity index 100% rename from assets/js/populate-essays.js rename to src/substack2markdown/assets/js/populate-essays.js diff --git a/substack_scraper.py b/src/substack2markdown/substack_scraper.py similarity index 100% rename from substack_scraper.py rename to src/substack2markdown/substack_scraper.py diff --git a/substack_html_pages/README.md b/substack_html_pages/README.md deleted file mode 100644 index 0931cf8e..00000000 --- a/substack_html_pages/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.html` files for each writer that will enable you -to browse and sort the downloaded markdown files for a given writer. One `.html` file -will be created for each writer. 
\ No newline at end of file From c73855e0a8aa17b916863c895acf8d6e741a2cf3 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 19:05:33 +0100 Subject: [PATCH 02/28] add parameters: config email password --- src/substack2markdown/substack_scraper.py | 42 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 126d260d..734dc553 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -21,7 +21,6 @@ from selenium.common.exceptions import SessionNotCreatedException from selenium.webdriver.chrome.service import Service from urllib.parse import urlparse -from config import EMAIL, PASSWORD USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown @@ -70,9 +69,10 @@ def generate_html_file(author_name: str) -> None: class BaseSubstackScraper(ABC): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" + self.args = args self.base_substack_url: str = base_substack_url self.writer_name: str = extract_main_part(base_substack_url) @@ -371,8 +371,8 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: class SubstackScraper(BaseSubstackScraper): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -392,6 +392,7 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): def __init__( self, + args, base_substack_url: str, md_save_dir: str, html_save_dir: str, @@ -400,7 +401,7 @@ def __init__( edge_driver_path: str = '', user_agent: str = '' ) -> None: - super().__init__(base_substack_url, md_save_dir, html_save_dir) + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) options = EdgeOptions() if headless: @@ -459,8 +460,8 @@ def login(self) -> None: # Email and password email = self.driver.find_element(By.NAME, "email") password = self.driver.find_element(By.NAME, "password") - email.send_keys(EMAIL) - password.send_keys(PASSWORD) + email.send_keys(self.args.email) + password.send_keys(self.args.password) # Find the submit button and click it. submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") @@ -494,6 +495,15 @@ def get_url_soup(self, url: str) -> BeautifulSoup: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") + parser.add_argument( + "--config", type=str, help="JSON config file with email and password." + ) + parser.add_argument( + "--email", type=str, help="Login E-Mail." + ) + parser.add_argument( + "--password", type=str, help="Login password." + ) parser.add_argument( "-u", "--url", type=str, help="The base URL of the Substack site to scrape." 
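        # e.g. --url https://example.substack.com/ (hypothetical publication)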
) @@ -556,17 +566,29 @@ def main(): if args.html_directory is None: args.html_directory = BASE_HTML_DIR + if args.config: + with open(args.config) as f: + config = json.load(f) + args.email = config["email"] + args.password = config["password"] + # TODO more + + assert args.email + assert args.password + if args.url: if args.premium: scraper = PremiumSubstackScraper( - args.url, + args=args, + base_substack_url=args.url, headless=args.headless, md_save_dir=args.directory, html_save_dir=args.html_directory ) else: scraper = SubstackScraper( - args.url, + args=args, + base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) @@ -575,6 +597,7 @@ def main(): else: # Use the hardcoded values at the top of the file if USE_PREMIUM: scraper = PremiumSubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory, @@ -583,6 +606,7 @@ def main(): ) else: scraper = SubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory From 15fc25c71720c2afba4b3bafceabb91b1ed672bb Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 07:55:10 +0100 Subject: [PATCH 03/28] add parameters: assets-dir author-template --- src/substack2markdown/substack_scraper.py | 73 +++++++++++------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 734dc553..297f0937 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -26,6 +26,7 @@ BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts @@ -37,12 +38,12 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(author_name: str) -> None: +def generate_html_file(args, author_name: str) -> None: """ Generates a HTML file for the given author. 
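     Reads data/<author_name>.json and fills the author template,
     writing the result to <html_directory>/<author_name>.html.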
""" - if not os.path.exists(BASE_HTML_DIR): - os.makedirs(BASE_HTML_DIR) + if not os.path.exists(args.html_directory): + os.makedirs(args.html_directory) # Read JSON data json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') @@ -52,7 +53,7 @@ def generate_html_file(author_name: str) -> None: # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file: + with open(args.author_template, 'r', encoding='utf-8') as file: html_template = file.read() # Insert the JSON string into the script tag in the HTML template @@ -63,7 +64,7 @@ def generate_html_file(author_name: str) -> None: html_with_author = html_with_data.replace('author_name', author_name) # Write the modified HTML to a new file - html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html') + html_output_path = os.path.join(args.html_directory, f'{author_name}.html') with open(html_output_path, 'w', encoding='utf-8') as file: file.write(html_with_author) @@ -193,7 +194,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir) + css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" @@ -367,7 +368,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(author_name=self.writer_name) + generate_html_file(self.args, author_name=self.writer_name) class SubstackScraper(BaseSubstackScraper): @@ -505,14 +506,22 @@ def parse_args() -> argparse.Namespace: "--password", type=str, help="Login password." ) parser.add_argument( - "-u", "--url", type=str, help="The base URL of the Substack site to scrape." + "-u", + "--url", # args.url + type=str, + default=BASE_SUBSTACK_URL, + help="The base URL of the Substack site to scrape." ) parser.add_argument( - "-d", "--directory", type=str, help="The directory to save scraped posts." + "-d", + "--directory", # args.directory + type=str, + default=BASE_MD_DIR, + help="The directory to save scraped posts." ) parser.add_argument( "-n", - "--number", + "--number", # args.number type=int, default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", @@ -523,6 +532,15 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include -p in command to use the Premium Substack Scraper with selenium.", ) + parser.add_argument( + "--assets-dir", # args.assets_dir + default=ASSETS_DIR, + help=f"Path to assets directory. Default: {ASSETS_DIR!r}", + ) + parser.add_argument( + "--author-template", # args.author_template + help=f"Path to author_template.html. Default: {repr('{assets_dir}/' + HTML_TEMPLATE)}", + ) parser.add_argument( "--headless", action="store_true", @@ -549,9 +567,10 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", + "--html-directory", # args.html_directory type=str, - help="The directory to save scraped posts as HTML files.", + default=BASE_HTML_DIR, + help=f"The directory to save scraped posts as HTML files. 
Default: {BASE_HTML_DIR!r}", ) return parser.parse_args() @@ -560,12 +579,6 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() - if args.directory is None: - args.directory = BASE_MD_DIR - - if args.html_directory is None: - args.html_directory = BASE_HTML_DIR - if args.config: with open(args.config) as f: config = json.load(f) @@ -576,7 +589,10 @@ def main(): assert args.email assert args.password - if args.url: + if not args.author_template: + args.author_template = args.assets_dir + "/" + HTML_TEMPLATE + + if True: if args.premium: scraper = PremiumSubstackScraper( args=args, @@ -594,25 +610,6 @@ def main(): ) scraper.scrape_posts(args.number) - else: # Use the hardcoded values at the top of the file - if USE_PREMIUM: - scraper = PremiumSubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory, - edge_path=args.edge_path, - edge_driver_path=args.edge_driver_path - ) - else: - scraper = SubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) - scraper.scrape_posts(num_posts_to_scrape=NUM_POSTS_TO_SCRAPE) - if __name__ == "__main__": main() From 8d7676f947cfad94cf32eb013ed12f32c2bc90f8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:05:02 +0100 Subject: [PATCH 04/28] use selenium_driverless --- requirements.txt | 3 +- src/substack2markdown/substack_scraper.py | 205 ++++++++++++---------- 2 files changed, 113 insertions(+), 95 deletions(-) diff --git a/requirements.txt b/requirements.txt index c58926a7..af704d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ bs4==0.0.1 html2text==2020.1.16 requests==2.31.0 -selenium==4.16.0 +selenium-driverless tqdm==4.66.1 -webdriver_manager==4.0.1 Markdown==3.6 diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 297f0937..0f22adc5 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep - +import asyncio +import atexit +import signal import html2text import markdown @@ -14,12 +16,8 @@ from tqdm import tqdm from xml.etree import ElementTree as ET -from selenium import webdriver -from selenium.webdriver.common.by import By -from webdriver_manager.microsoft import EdgeChromiumDriverManager -from selenium.webdriver.edge.options import Options as EdgeOptions -from selenium.common.exceptions import SessionNotCreatedException -from selenium.webdriver.chrome.service import Service +from selenium_driverless import webdriver +from selenium_driverless.types.by import By from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts @@ -70,6 +68,15 @@ def generate_html_file(args, author_name: str) -> None: class BaseSubstackScraper(ABC): + def __await__(self): + return self._async_init().__await__() + + async def __aenter__(self): + return await self + + async def __aexit__(self, exc_type, exc, tb): + pass + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" @@ -92,6 +99,10 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = 
self.get_all_post_urls() + async def _async_init(self): + self._loop = asyncio.get_running_loop() + return self + def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. @@ -326,7 +337,7 @@ def save_essays_data_to_json(self, essays_data: list) -> None: with open(json_path, 'w', encoding='utf-8') as f: json.dump(essays_data, f, ensure_ascii=False, indent=4) - def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: + async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ @@ -340,8 +351,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) - if not os.path.exists(md_filepath): - soup = self.get_url_soup(url) + soup = await self.get_url_soup(url) if soup is None: total += 1 continue @@ -398,100 +408,109 @@ def __init__( md_save_dir: str, html_save_dir: str, headless: bool = False, - edge_path: str = '', - edge_driver_path: str = '', + chromium_path: str = '', user_agent: str = '' ) -> None: super().__init__(args, base_substack_url, md_save_dir, html_save_dir) - options = EdgeOptions() + self.driver = None + + def exit_handler(signum, frame): + print() + print(f"exit_handler: received signal {signum}") + try: + asyncio.get_event_loop().create_task(self._cleanup_sync()) + except Exception: + pass + raise SystemExit(0) + + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + atexit.register(self._cleanup_sync) + + options = webdriver.ChromeOptions() + self.chrome_options = options if headless: - # modern headless flag (works better with recent Edge/Chromium) + # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if edge_path: - options.binary_location = edge_path + if chromium_path: + options.binary_location = chromium_path if user_agent: options.add_argument(f"user-agent={user_agent}") - - if isinstance(options, EdgeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com") - elif isinstance(options, ChromeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://chromedriver.storage.googleapis.com") - - self.driver = None + async def _async_init(self): + self._loop = asyncio.get_running_loop() - # 1) Prefer an explicit driver path (manual download) - if edge_driver_path and os.path.exists(edge_driver_path): - service = Service(executable_path=edge_driver_path) - self.driver = webdriver.Edge(service=service, options=options) - else: - # 2) Try webdriver_manager (needs network/DNS) - try: - service = Service(EdgeChromiumDriverManager().install()) - self.driver = webdriver.Edge(service=service, options=options) - except Exception as e: - print("webdriver_manager could not download msedgedriver (network/DNS). 
Falling back to Selenium Manager.") - # 3) Selenium Manager fallback (still needs network; but avoids webdriver_manager) + await self._start_driver() + await self.login() + return self + + async def _start_driver(self): + self.driver = await webdriver.Chrome(options=self.chrome_options) + + async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self) -> None: + if self.driver: + await self.driver.quit() + + def _cleanup_sync(self): + try: + if not self.driver: + return + proc = self.driver._process + if proc and proc.poll() is None: + proc.terminate() try: - # IMPORTANT: ensure no stale driver in PATH (e.g. C:\Windows\msedgedriver.exe v138) - self.driver = webdriver.Edge(options=options) - except SessionNotCreatedException as se: - raise RuntimeError( - "Selenium Manager fallback failed due to driver/browser mismatch.\n" - "Fix by either: (a) removing stale msedgedriver in PATH (e.g. C:\\Windows\\msedgedriver.exe) and replace with a fresh one downloaded from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver, " - "or (b) pass --edge-driver-path to a manually downloaded driver that matches your Edge version." - ) from se + proc.wait(timeout=1) + except Exception: + proc.kill() + except Exception as exc: + print("_cleanup_sync failed:", exc) + + async def login(self): + await self.driver.get("https://substack.com/sign-in") + await asyncio.sleep(2) + + signin = await self.driver.find_element( + By.XPATH, "//a[contains(@class,'login-option')]" + ) + await signin.click() - self.login() + await asyncio.sleep(2) - def login(self) -> None: - """ - This method logs into Substack using Selenium - """ - self.driver.get("https://substack.com/sign-in") - sleep(3) + email = await self.driver.find_element(By.NAME, "email") + password = await self.driver.find_element(By.NAME, "password") - signin_with_password = self.driver.find_element( - By.XPATH, "//a[@class='login-option substack-login__login-option']" + await email.send_keys(self.args.email) + await password.send_keys(self.args.password) + + submit = await self.driver.find_element( + By.XPATH, "//*[@id='substack-login']//form//button" ) - signin_with_password.click() - sleep(3) - - # Email and password - email = self.driver.find_element(By.NAME, "email") - password = self.driver.find_element(By.NAME, "password") - email.send_keys(self.args.email) - password.send_keys(self.args.password) - - # Find the submit button and click it. - submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") - submit.click() - sleep(30) # Wait for the page to load - - if self.is_login_failed(): - raise Exception( - "Warning: Login unsuccessful. Please check your email and password, or your account status.\n" - "Use the non-premium scraper for the non-paid posts. \n" - "If running headless, run non-headlessly to see if blocked by Captcha." - ) + await submit.click() - def is_login_failed(self) -> bool: + await asyncio.sleep(8) + + if await self.is_login_failed(): + raise RuntimeError("Substack login failed") + + async def is_login_failed(self): """ Check for the presence of the 'error-container' to indicate a failed login attempt. 
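         After a failed submit, Substack renders an element with
         id="error-container"; find_elements returns an empty list
         when it is absent, so the truthiness of that list is the signal.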
""" - error_container = self.driver.find_elements(By.ID, 'error-container') - return len(error_container) > 0 and error_container[0].is_displayed() + elements = await self.driver.find_elements(By.ID, "error-container") + return bool(elements) - def get_url_soup(self, url: str) -> BeautifulSoup: + async def get_url_soup(self, url: str): """ Gets soup from URL using logged in selenium driver """ - try: - self.driver.get(url) - return BeautifulSoup(self.driver.page_source, "html.parser") - except Exception as e: - raise ValueError(f"Error fetching page: {e}") from e + await self.driver.get(url) + html = await self.driver.page_source + return BeautifulSoup(html, "html.parser") def parse_args() -> argparse.Namespace: @@ -548,16 +567,10 @@ def parse_args() -> argparse.Namespace: "Scraper.", ) parser.add_argument( - "--edge-path", + "--chromium-path", # args.chromium_path type=str, default="", - help='Optional: The path to the Edge browser executable (i.e. "path_to_msedge.exe").', - ) - parser.add_argument( - "--edge-driver-path", - type=str, - default="", - help='Optional: The path to the Edge WebDriver executable (i.e. "path_to_msedgedriver.exe").', + help='Optional: The path to the Chromium browser executable (i.e. "path/to/chromium").', ) parser.add_argument( "--user-agent", @@ -576,7 +589,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def main(): +async def async_main(): args = parse_args() if args.config: @@ -594,7 +607,7 @@ def main(): if True: if args.premium: - scraper = PremiumSubstackScraper( + scraper = await PremiumSubstackScraper( args=args, base_substack_url=args.url, headless=args.headless, @@ -602,13 +615,19 @@ def main(): html_save_dir=args.html_directory ) else: - scraper = SubstackScraper( + scraper = await SubstackScraper( args=args, base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) - scraper.scrape_posts(args.number) + + await scraper.scrape_posts(args.number) + await scraper.close() + + +def main(): + asyncio.run(async_main()) if __name__ == "__main__": From 4af8b45d9a477df427a352b5038ea14d983451ce Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:07:52 +0100 Subject: [PATCH 05/28] replace existing files --- src/substack2markdown/substack_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 0f22adc5..91c3f2ac 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -178,7 +178,8 @@ def save_to_file(filepath: str, content: str) -> None: if not isinstance(content, str): raise ValueError("content must be a string") - if os.path.exists(filepath): + # if os.path.exists(filepath): + if False: print(f"File already exists: {filepath}") return @@ -351,6 +352,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) + # if not os.path.exists(md_filepath): + if True: soup = await self.get_url_soup(url) if soup is None: total += 1 From 4b8598af5affd08af4a56e1329f47959f520777c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 11:15:06 +0100 Subject: [PATCH 06/28] fixup: assets-dir --- src/substack2markdown/substack_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 
91c3f2ac..fc3bdabf 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -206,7 +206,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) + css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" From 5811bb5de77a0302e619936afaa7fbe67d45b307 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 12:34:02 +0100 Subject: [PATCH 07/28] download images based on https://github.com/timf34/Substack2Markdown/pull/26 --- src/substack2markdown/substack_scraper.py | 186 +++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index fc3bdabf..e2cc62ae 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -1,6 +1,13 @@ import argparse import json import os +import io +import re +import base64 +import hashlib +import mimetypes +from pathlib import Path +from urllib.parse import urlparse, unquote from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep @@ -18,18 +25,54 @@ from selenium_driverless import webdriver from selenium_driverless.types.by import By -from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +BASE_IMAGE_DIR: str = "substack_images" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts +def count_images_in_markdown(md_content: str) -> int: + """Count number of Substack CDN image URLs in markdown content.""" + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + # regex lookahead: match "...)" but not "...)]" suffix + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)') + matches = re.findall(pattern, md_content) + return len(matches) + + +def sanitize_image_filename(url: str) -> str: + """Create a safe filename from URL or content.""" + # Extract original filename from CDN URL + if "substackcdn.com" in url: + # Get the actual image URL after the CDN parameters + original_url = unquote(url.split("/https%3A%2F%2F")[1]) + filename = original_url.split("/")[-1] + else: + filename = url.split("/")[-1] + + # Remove invalid characters + filename = re.sub(r'[<>:"/\\|?*]', '', filename) + + # If filename is too long or empty, create hash-based name + if len(filename) > 100 or not filename: + hash_object = hashlib.md5(url.encode()) + ext = mimetypes.guess_extension(requests.head(url).headers.get('content-type', '')) or '.jpg' + filename = f"{hash_object.hexdigest()}{ext}" + + return filename + + +def get_post_slug(url: str) -> str: + match = re.search(r'/p/([^/]+)', url) + return 
match.group(1) if match else 'unknown_post' + + def extract_main_part(url: str) -> str: parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.' return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if @@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir os.makedirs(self.html_save_dir) print(f"Created html directory {self.html_save_dir}") + if not self.args.no_images: + os.makedirs(self.args.image_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + + if not self.args.no_images: + total_images = count_images_in_markdown(md) + post_slug = get_post_slug(url) + with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: + md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_essays_data_to_json(essays_data=essays_data) generate_html_file(self.args, author_name=self.writer_name) + async def download_image( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image from URL and save to path.""" + try: + response = requests.get(url, stream=True) + if response.status_code == 200: + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + if pbar: + pbar.update(1) + return str(save_path) + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {str(exc)}") + # raise exc # debug + return None + + async def process_markdown_images( + self, + md_content: str, + author: str, + post_slug: str, + pbar=None + ) -> str: + """Process markdown content to download images and update references.""" + image_dir = Path(self.args.image_directory) / author / post_slug + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + buf = io.StringIO() + last_end = 0 + for match in pattern.finditer(md_content): + buf.write(md_content[last_end:match.start()]) + url = match.group(0).strip("()") + filename = sanitize_image_filename(url) + save_path = image_dir / filename + if not save_path.exists(): + await self.download_image(url, save_path, pbar) + rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + buf.write(f"({rel_path})") + last_end = match.end() + buf.write(md_content[last_end:]) + return buf.getvalue() + class SubstackScraper(BaseSubstackScraper): def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): @@ -515,6 +618,76 @@ async def get_url_soup(self, url: str): html = await self.driver.page_source return BeautifulSoup(html, "html.parser") + async def download_image_FIXME( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image using selenium_driverless""" + + # NOTE for now this works with the default "def download_image" + + # WONTFIX "fetch" fails due to CORS policy 
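+        # the CDP route mentioned below would look roughly like this
+        # (hypothetical sketch, untested; request_id would have to be
+        #  captured from "Network.responseReceived" events first):
+        #   await self.driver.execute_cdp_cmd("Network.enable", {})
+        #   res = await self.driver.execute_cdp_cmd(
+        #       "Network.getResponseBody", {"requestId": request_id})
+        #   data = base64.b64decode(res["body"]) if res["base64Encoded"] else res["body"].encode()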
+ + # WONTFIX "canvas" does not return the original image bytes + + # we could fetch images with CDP Network.getResponseBody + # but that requires lots of boilerplate code + # fix: use https://github.com/milahu/aiohttp_chromium + + try: + # Execute JS fetch inside browser + result = await self.driver.execute_async_script( + """ + const url = arguments[0]; + const callback = arguments[arguments.length - 1]; + + const img = new Image(); + img.crossOrigin = 'Anonymous'; // try to avoid CORS issues + img.onload = () => { + try { + const canvas = document.createElement('canvas'); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0); + const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..." + const base64 = dataUrl.split(',')[1]; // strip prefix + callback({data: base64}); + } catch (err) { + callback({error: err.message, stack: err.stack}); + } + }; + img.onerror = (err) => { + callback({error: 'Image load error', stack: err.toString()}); + }; + img.src = url; + """, + url + ) + + if isinstance(result, dict) and "error" in result: + raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}") + + # Decode base64 to bytes + image_bytes = base64.b64decode(result) + + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, "wb") as f: + f.write(image_bytes) + + if pbar: + pbar.update(1) + + return str(save_path) + + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {exc}") + # raise exc # debug + return None + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") @@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace: default=BASE_HTML_DIR, help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", ) + parser.add_argument( + "--image-directory", # args.image_directory + type=str, + default=BASE_IMAGE_DIR, + help=f"The directory to save scraped image files. 
Default: {BASE_IMAGE_DIR!r}", + ) + parser.add_argument( + "--no-images", # args.no_images + action="store_true", + help=f"Do not download images.", + ) return parser.parse_args() From 153746f5b76d664526db9580259d3be6aa4bcd59 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 18:38:46 +0100 Subject: [PATCH 08/28] download comments fix https://github.com/timf34/Substack2Markdown/issues/3 --- src/substack2markdown/substack_scraper.py | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index e2cc62ae..d2f2ad3a 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -363,6 +363,98 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): + # all comments are stored in javascript + # + # only some comments are rendered in html + # with buttons to "Expand full comment" and "Load More" + # see also + # https://www.selfpublife.com/p/automatically-expand-all-substack-comments + window_preloads = None + for script_element in soup.select("script"): + script_text = script_element.text.strip() + if not script_text.startswith("window._preloads"): + continue + # pos1 = re.search(r'window._preloads\s*=\s*JSON\.parse\(', script_text).span()[1] + pos1 = script_text.find("(") + 1 + pos2 = script_text.rfind(")") + window_preloads = json.loads(json.loads(script_text[pos1:pos2])) + break + assert window_preloads, f"not found at {url!r}" + return window_preloads + + def count_comments(self, comments_preloads): + + def count_comments_inner(comment): + res = 1 + for child_comment in comment["children"]: + res += count_comments_inner(child_comment) + return res + + res = 0 + for comment in comments_preloads["initialComments"]: + res += count_comments_inner(comment) + return res + + def render_comments_html(self, comments_preloads): + + def render_comment_body(body): + body = body.strip() + body = "

" + body + "

" + body = body.replace("\n", "

\n

") + # TODO more? + return body + + def render_comments_html_inner(comment, buf): + assert comment["type"] == "comment", f'unexpected comment type: {comment["type"]!r}' + buf.write(f'

\n') + buf.write(f'\n') + + # NOTE user IDs are constant, user handles are variable + # when i change my user handle + # then other users can use my old user handle + buf.write(f'') + buf.write(comment["name"]) # human-readable username + buf.write('\n') + + other_pub = comment["metadata"].get("author_on_other_pub") + if other_pub: + # NOTE publication handles are quasi-constant: + # when i change my publication handle + # then other users cannot use my old publication handle + # NOTE "Changing your publication's subdomain + # does not automatically set up a redirect from the old subdomain to the new one." + buf.write(f'(') + buf.write(other_pub["name"]) + buf.write(')\n') + + buf.write(comment["date"] + '\n') # "2025-05-17T06:51:39.485Z" + + for reaction, reaction_count in comment["reactions"].items(): + if reaction_count == 0: continue + buf.write(reaction + str(reaction_count) + '\n') # "❤123" + # buf.write(str(reaction_count) + reaction + '\n') # "123❤" + + buf.write('\n') + + buf.write('
\n') + buf.write('\n') + buf.write(render_comment_body(comment["body"]) + '\n') + + for child_comment in comment["children"]: + buf.write('\n') + render_comments_html_inner(child_comment, buf) + buf.write('
\n') + + buf.write('
\n') + buf.write('\n') + + buf = io.StringIO() + # NOTE the name "initial" is misleading. all comments are stored in this array + # NOTE comments are sorted by likes + for comment in comments_preloads["initialComments"]: + render_comments_html_inner(comment, buf) + return buf.getvalue() @abstractmethod def get_url_soup(self, url: str) -> str: @@ -412,6 +504,37 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + comments_html = None + comments_num = None + if not self.args.no_comments: + comments_url = url + "/comments" + # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) + if 0: + # debug + # TODO add option to write the original "preloads" data to json files + with open("comments_preloads.json", "w") as f: + json.dump(comments_preloads, f, indent=2) + raise 5 + comments_num = self.count_comments(comments_preloads) + if comments_num > 0: + comments_html = self.render_comments_html(comments_preloads) + comments_html = ( + '\n\n' + + '
<div>\n' +
+                            # this can collide with other elements with id="comments"
+                            # '<div id="comments">\n' +
+                            '<details open>\n' +
+                            '<summary><h2>Comments</h2></summary>\n' +
+                            '<hr>\n' +
+                            f'{comments_num} comments\n' +
+                            comments_html + '\n' +
+                            '</details>' + '</div>
' + ) + md += comments_html + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -422,6 +545,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "title": title, "subtitle": subtitle, "like_count": like_count, + "comment_count": comments_num, "date": date, "file_link": md_filepath, "html_link": html_filepath @@ -772,6 +896,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download images.", ) + parser.add_argument( + "--no-comments", # args.no_comments + action="store_true", + help=f"Do not download comments.", + ) return parser.parse_args() From 591fa86b49e6f38d802c741e3d8b8f10365c2cf8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:11 +0100 Subject: [PATCH 09/28] handle removed comments --- src/substack2markdown/substack_scraper.py | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d2f2ad3a..27890458 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -413,9 +413,19 @@ def render_comments_html_inner(comment, buf): # NOTE user IDs are constant, user handles are variable # when i change my user handle # then other users can use my old user handle - buf.write(f'') - buf.write(comment["name"]) # human-readable username - buf.write('\n') + if not comment["user_id"] is None: + buf.write(f'') + + if not comment["name"] is None: + buf.write(comment["name"]) # human-readable username + else: + # Comment removed + buf.write("null") + + if not comment["user_id"] is None: + buf.write('\n') + else: + buf.write('\n') other_pub = comment["metadata"].get("author_on_other_pub") if other_pub: @@ -439,7 +449,21 @@ def render_comments_html_inner(comment, buf): buf.write('
\n') buf.write('\n') - buf.write(render_comment_body(comment["body"]) + '\n') + + if comment["body"] is None: + # Comment removed + status = comment.get("status") + if status is None: + buf.write('(Comment removed)\n') + else: + # "moderator_removed", ... + buf.write('(status:' + status + ')\n') + # TODO comment["bans"] + # TODO comment["suppressed"] + # TODO comment["user_banned"] + # TODO comment["user_banned_for_comment"] + else: + buf.write(render_comment_body(comment["body"]) + '\n') for child_comment in comment["children"]: buf.write('\n') From 1458d78ea4828be288c73abdf5462641047813b2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:48 +0100 Subject: [PATCH 10/28] add debug comment --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 27890458..2a1c2727 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -578,6 +578,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: print(f"File already exists: {md_filepath}") except Exception as e: print(f"Error scraping post: {e}") + # raise e # debug count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break From 07e4c1d1581d81d63ff6ac7624da03fa513047d7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 23:26:21 +0100 Subject: [PATCH 11/28] write JSON files --- src/substack2markdown/substack_scraper.py | 35 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 2a1c2727..d31ca8dd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -31,6 +31,7 @@ BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files BASE_IMAGE_DIR: str = "substack_images" +BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" @@ -132,6 +133,8 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.md_save_dir: str = md_save_dir self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.args.json_directory += f"/{self.writer_name}" + if not os.path.exists(md_save_dir): os.makedirs(md_save_dir) print(f"Created md directory {md_save_dir}") @@ -142,6 +145,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir if not self.args.no_images: os.makedirs(self.args.image_directory, exist_ok=True) + if not self.args.no_json: + os.makedirs(self.args.json_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -535,12 +541,11 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) - if 0: - # debug - # TODO add option to write the original "preloads" data to json files - with open("comments_preloads.json", "w") as f: - json.dump(comments_preloads, f, indent=2) - raise 5 + if not 
self.args.no_json: + json_filename = self.get_filename_from_url(url, filetype=".comments.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: comments_html = self.render_comments_html(comments_preloads) @@ -561,6 +566,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) + if not self.args.no_json: + post_preloads = await self.get_window_preloads(soup) + json_filename = self.get_filename_from_url(url, filetype=".post.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) + # Convert markdown to HTML and save html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) @@ -916,6 +928,12 @@ def parse_args() -> argparse.Namespace: default=BASE_IMAGE_DIR, help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", ) + parser.add_argument( + "--json-directory", # args.json_directory + type=str, + default=BASE_JSON_DIR, + help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + ) parser.add_argument( "--no-images", # args.no_images action="store_true", @@ -926,6 +944,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download comments.", ) + parser.add_argument( + "--no-json", # args.no_json + action="store_true", + help=f"Do not write JSON files.", + ) return parser.parse_args() From f17ad508fff59408888cd0f3455a22382a698f5c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Mon, 29 Dec 2025 15:29:46 +0100 Subject: [PATCH 12/28] use output filepath format strings --- src/substack2markdown/substack_scraper.py | 325 +++++++++++++--------- 1 file changed, 190 insertions(+), 135 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d31ca8dd..79a3c9c7 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -14,6 +14,7 @@ import asyncio import atexit import signal +import string import html2text import markdown @@ -34,9 +35,15 @@ BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page -JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts - +DEFAULT_OUTPUT_DIRECTORY_FORMAT = "$publication_domain" +DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename" +DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md" +DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html" +DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html" +DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json" +DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" +DEFAULT_COMMENTS_JSON_PATH_FORMAT = "p/$post_slug/comments.json" def count_images_in_markdown(md_content: str) -> int: """Count number of Substack CDN image URLs in markdown content.""" @@ -80,37 +87,6 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(args, author_name: str) -> None: - """ - Generates a HTML file for the given author. 
- """ - if not os.path.exists(args.html_directory): - os.makedirs(args.html_directory) - - # Read JSON data - json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') - with open(json_path, 'r', encoding='utf-8') as file: - essays_data = json.load(file) - - # Convert JSON data to a JSON string for embedding - embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - - with open(args.author_template, 'r', encoding='utf-8') as file: - html_template = file.read() - - # Insert the JSON string into the script tag in the HTML template - html_with_data = html_template.replace('', author_name).replace( - '', - f'' - ) - html_with_author = html_with_data.replace('author_name', author_name) - - # Write the modified HTML to a new file - html_output_path = os.path.join(args.html_directory, f'{author_name}.html') - with open(html_output_path, 'w', encoding='utf-8') as file: - file.write(html_with_author) - - class BaseSubstackScraper(ABC): def __await__(self): return self._async_init().__await__() @@ -121,32 +97,28 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc, tb): pass - def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): - if not base_substack_url.endswith("/"): - base_substack_url += "/" + def __init__(self, args): self.args = args - self.base_substack_url: str = base_substack_url - - self.writer_name: str = extract_main_part(base_substack_url) - md_save_dir: str = f"{md_save_dir}/{self.writer_name}" + if not self.args.url.endswith("/"): + self.args.url += "/" - self.md_save_dir: str = md_save_dir - self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.publication_handle: str = extract_main_part(self.args.url) - self.args.json_directory += f"/{self.writer_name}" + self.output_directory_template = string.Template(self.args.output_directory_format) - if not os.path.exists(md_save_dir): - os.makedirs(md_save_dir) - print(f"Created md directory {md_save_dir}") - if not os.path.exists(self.html_save_dir): - os.makedirs(self.html_save_dir) - print(f"Created html directory {self.html_save_dir}") + # all these paths are relative to output_directory + self.md_path_template = string.Template(self.args.md_path_format) + self.html_path_template = string.Template(self.args.html_path_format) + self.image_path_template = string.Template(self.args.image_path_format) + self.posts_html_path_template = string.Template(self.args.posts_html_path_format) + self.posts_json_path_template = string.Template(self.args.posts_json_path_format) + self.post_json_path_template = string.Template(self.args.post_json_path_format) + self.comments_json_path_template = string.Template(self.args.comments_json_path_format) - if not self.args.no_images: - os.makedirs(self.args.image_directory, exist_ok=True) - - if not self.args.no_json: - os.makedirs(self.args.json_directory, exist_ok=True) + self.format_vars = { + "publication_handle": self.publication_handle, + "publication_domain": f"{self.publication_handle}.substack.com", + } self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -168,7 +140,7 @@ def fetch_urls_from_sitemap(self) -> List[str]: """ Fetches URLs from sitemap.xml. """ - sitemap_url = f"{self.base_substack_url}sitemap.xml" + sitemap_url = f"{self.args.url}sitemap.xml" response = requests.get(sitemap_url) if not response.ok: @@ -184,7 +156,7 @@ def fetch_urls_from_feed(self) -> List[str]: Fetches URLs from feed.xml. """ print('Falling back to feed.xml. 
This will only contain up to the 22 most recent posts.')
-        feed_url = f"{self.base_substack_url}feed.xml"
+        feed_url = f"{self.args.url}feed.xml"
         response = requests.get(feed_url)
 
         if not response.ok:
@@ -258,7 +230,9 @@ def save_to_html_file(self, filepath: str, content: str) -> None:
 
         # Calculate the relative path from the HTML file to the CSS file
         html_dir = os.path.dirname(filepath)
-        css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir)
+        css_path = self.args.assets_dir + "/css/essay-styles.css"
+        if not os.path.isabs(css_path):
+            css_path = os.path.relpath(css_path, html_dir)
         css_path = css_path.replace("\\", "/")  # Ensure forward slashes for web paths
         html_content = f"""
@@ -490,35 +464,55 @@ def render_comments_html_inner(comment, buf):
     def get_url_soup(self, url: str) -> str:
         raise NotImplementedError
 
-    def save_essays_data_to_json(self, essays_data: list) -> None:
+    def save_posts_data_json(self, posts_data: list) -> None:
         """
         Saves essays data to a JSON file for a specific author.
         """
-        data_dir = os.path.join(JSON_DATA_DIR)
-        if not os.path.exists(data_dir):
-            os.makedirs(data_dir)
-
-        json_path = os.path.join(data_dir, f'{self.writer_name}.json')
-        if os.path.exists(json_path):
-            with open(json_path, 'r', encoding='utf-8') as file:
-                existing_data = json.load(file)
-            essays_data = existing_data + [data for data in essays_data if data not in existing_data]
-        with open(json_path, 'w', encoding='utf-8') as f:
-            json.dump(essays_data, f, ensure_ascii=False, indent=4)
+        posts_json_path = os.path.join(
+            self.format_vars["output_directory"],
+            self.posts_json_path_template.substitute(self.format_vars)
+        )
+        os.makedirs(os.path.dirname(posts_json_path), exist_ok=True)
+        if os.path.exists(posts_json_path):
+            with open(posts_json_path, 'r', encoding='utf-8') as file:
+                existing_data = json.load(file)
+            # remove duplicates from existing_data
+            new_post_ids = set(map(lambda p: p["id"], posts_data))
+            existing_data = [p for p in existing_data if p["id"] not in new_post_ids]
+            posts_data = existing_data + posts_data
+        # sort by post_id, descending
+        posts_data.sort(key=lambda p: -1*p["id"])
+        with open(posts_json_path, 'w', encoding='utf-8') as f:
+            json.dump(posts_data, f, ensure_ascii=False, separators=(',', ':'))
 
     async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
         """
         Iterates over all posts and saves them as markdown and html files
         """
-        essays_data = []
+        posts_data = []
         count = 0
         total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls)
         for url in tqdm(self.post_urls, total=total):
             try:
-                md_filename = self.get_filename_from_url(url, filetype=".md")
-                html_filename = self.get_filename_from_url(url, filetype=".html")
-                md_filepath = os.path.join(self.md_save_dir, md_filename)
-                html_filepath = os.path.join(self.html_save_dir, html_filename)
+                post_slug = url.split("/")[-1]
+                self.format_vars["post_slug"] = post_slug
+
+                output_directory = self.output_directory_template.substitute(self.format_vars)
+                self.format_vars["output_directory"] = output_directory
+
+                md_filepath = os.path.join(
+                    output_directory,
+                    self.md_path_template.substitute(self.format_vars)
+                )
+                self.format_vars["md_filepath"] = md_filepath
+                self.format_vars["md_directory"] = os.path.dirname(md_filepath)
+
+                html_filepath = os.path.join(
+                    output_directory,
+                    self.html_path_template.substitute(self.format_vars)
+                )
+                self.format_vars["html_filepath"] = html_filepath
+                self.format_vars["html_directory"] = os.path.dirname(html_filepath)
 
                 # if not os.path.exists(md_filepath):
                 if True:
@@ 
-527,12 +521,14 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + post_preloads = await self.get_window_preloads(soup) + + post_id = post_preloads["post"]["id"] if not self.args.no_images: total_images = count_images_in_markdown(md) - post_slug = get_post_slug(url) with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: - md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + md = await self.process_markdown_images(md, img_pbar) comments_html = None comments_num = None @@ -542,8 +538,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) if not self.args.no_json: - json_filename = self.get_filename_from_url(url, filetype=".comments.json") - json_filepath = os.path.join(self.args.json_directory, json_filename) + json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) @@ -567,9 +565,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) if not self.args.no_json: - post_preloads = await self.get_window_preloads(soup) - json_filename = self.get_filename_from_url(url, filetype=".post.json") - json_filepath = os.path.join(self.args.json_directory, json_filename) + json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) self.save_to_file(json_filepath, _json) @@ -577,7 +576,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) - essays_data.append({ + posts_data.append({ + "id": post_id, "title": title, "subtitle": subtitle, "like_count": like_count, @@ -594,8 +594,51 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break - self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(self.args, author_name=self.writer_name) + self.save_posts_data_json(posts_data) + self.generate_main_html_file() + + def generate_main_html_file(self) -> None: + """ + Generates a HTML file for the given author. 
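+        Reads the accumulated posts JSON file, embeds it into the
+        author template, and writes the result to the posts HTML path.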
+ """ + # Read JSON data + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + + # Convert JSON data to a JSON string for embedding + embedded_json_data = json.dumps(posts_data, ensure_ascii=False, separators=(',', ':')) + + html_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_html_path_template.substitute(self.format_vars) + ) + + with open(self.args.author_template, 'r', encoding='utf-8') as file: + html_template = file.read() + + html_with_data = html_template + + # patch assets path + assets_path = self.args.assets_dir + if not os.path.isabs(assets_path): + assets_path = os.path.relpath(assets_path, os.path.dirname(html_output_path)) + html_with_data = html_with_data.replace('"../assets', f'"{assets_path}') + + html_with_data = html_with_data.replace('', self.publication_handle) + + # Insert the JSON string into the script tag in the HTML template + html_with_data = html_with_data.replace( + '', + f'' + ) + + # Write the modified HTML to a new file + with open(html_output_path, 'w', encoding='utf-8') as file: + file.write(html_with_data) async def download_image( self, @@ -624,12 +667,10 @@ async def download_image( async def process_markdown_images( self, md_content: str, - author: str, - post_slug: str, pbar=None ) -> str: """Process markdown content to download images and update references.""" - image_dir = Path(self.args.image_directory) / author / post_slug + output_directory = self.format_vars["output_directory"] # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') buf = io.StringIO() @@ -638,10 +679,20 @@ async def process_markdown_images( buf.write(md_content[last_end:match.start()]) url = match.group(0).strip("()") filename = sanitize_image_filename(url) - save_path = image_dir / filename + format_vars = { + **self.format_vars, + "image_filename": filename, + } + save_path = Path(os.path.join( + output_directory, + self.image_path_template.substitute(format_vars) + )) if not save_path.exists(): await self.download_image(url, save_path, pbar) - rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + md_directory = self.format_vars["md_directory"] + rel_path = save_path + if not os.path.isabs(rel_path): + rel_path = os.path.relpath(save_path, md_directory) buf.write(f"({rel_path})") last_end = match.end() buf.write(md_content[last_end:]) @@ -649,8 +700,8 @@ async def process_markdown_images( class SubstackScraper(BaseSubstackScraper): - def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(args, base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args): + super().__init__(args, self.args.url) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -668,17 +719,8 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): - def __init__( - self, - args, - base_substack_url: str, - md_save_dir: str, - html_save_dir: str, - headless: bool = False, - chromium_path: str = '', - user_agent: str = '' - ) -> None: - super().__init__(args, base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args) -> None: + super().__init__(args) self.driver = None @@ -698,13 +740,13 @@ def exit_handler(signum, 
frame): options = webdriver.ChromeOptions() self.chrome_options = options - if headless: + if self.args.headless: # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if chromium_path: - options.binary_location = chromium_path - if user_agent: - options.add_argument(f"user-agent={user_agent}") + if self.args.chromium_path: + options.binary_location = self.args.chromium_path + if self.args.user_agent: + options.add_argument(f"user-agent={self.args.user_agent}") async def _async_init(self): self._loop = asyncio.get_running_loop() @@ -868,13 +910,6 @@ def parse_args() -> argparse.Namespace: default=BASE_SUBSTACK_URL, help="The base URL of the Substack site to scrape." ) - parser.add_argument( - "-d", - "--directory", # args.directory - type=str, - default=BASE_MD_DIR, - help="The directory to save scraped posts." - ) parser.add_argument( "-n", "--number", # args.number @@ -917,22 +952,53 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", # args.html_directory + "--output-directory-format", # args.output_directory_format + type=str, + default=DEFAULT_OUTPUT_DIRECTORY_FORMAT, + # all relative output file paths are relative to this directory + help=f"The file path format of the directory to save output files. Default: {DEFAULT_OUTPUT_DIRECTORY_FORMAT!r}", + ) + parser.add_argument( + "--md-path-format", # args.md_path_format + type=str, + default=DEFAULT_MD_PATH_FORMAT, + help=f"The file path format to save scraped posts as Markdown files. Default: {DEFAULT_MD_PATH_FORMAT!r}", + ) + parser.add_argument( + "--html-path-format", # args.html_path_format + type=str, + default=DEFAULT_HTML_PATH_FORMAT, + help=f"The file path format to save scraped posts as HTML files. Default: {DEFAULT_HTML_PATH_FORMAT!r}", + ) + parser.add_argument( + "--image-path-format", # args.image_path_format + type=str, + default=DEFAULT_IMAGE_PATH_FORMAT, + help=f"The file path format to save scraped image files. Default: {DEFAULT_IMAGE_PATH_FORMAT!r}", + ) + parser.add_argument( + "--posts-html-path-format", # args.posts_html_path_format + type=str, + default=DEFAULT_POSTS_HTML_PATH_FORMAT, + help=f"The file path format to save an index of scraped posts as HTML file. Default: {DEFAULT_POSTS_HTML_PATH_FORMAT!r}", + ) + parser.add_argument( + "--posts-json-path-format", # args.posts_json_path_format type=str, - default=BASE_HTML_DIR, - help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", + default=DEFAULT_POSTS_JSON_PATH_FORMAT, + help=f"The file path format to save metadata of scraped posts as JSON file. Default: {DEFAULT_POSTS_JSON_PATH_FORMAT!r}", ) parser.add_argument( - "--image-directory", # args.image_directory + "--post-json-path-format", # args.post_json_path_format type=str, - default=BASE_IMAGE_DIR, - help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", + default=DEFAULT_POST_JSON_PATH_FORMAT, + help=f"The file path format to save scraped posts as JSON files. Default: {DEFAULT_POST_JSON_PATH_FORMAT!r}", ) parser.add_argument( - "--json-directory", # args.json_directory + "--comments-json-path-format", # args.comments_json_path_format type=str, - default=BASE_JSON_DIR, - help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + default=DEFAULT_COMMENTS_JSON_PATH_FORMAT, + help=f"The file path format to save scraped comments as JSON files. 
Default: {DEFAULT_COMMENTS_JSON_PATH_FORMAT!r}", ) parser.add_argument( "--no-images", # args.no_images @@ -971,20 +1037,9 @@ async def async_main(): if True: if args.premium: - scraper = await PremiumSubstackScraper( - args=args, - base_substack_url=args.url, - headless=args.headless, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) + scraper = await PremiumSubstackScraper(args) else: - scraper = await SubstackScraper( - args=args, - base_substack_url=args.url, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) + scraper = await SubstackScraper(args) await scraper.scrape_posts(args.number) await scraper.close() From 38875f844e689c5f865c862c3829bdcae94c0e79 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Mon, 29 Dec 2025 16:29:45 +0100 Subject: [PATCH 13/28] add json_dump_kwargs --- src/substack2markdown/substack_scraper.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 79a3c9c7..dd151b9e 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -45,6 +45,12 @@ DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" DEFAULT_COMMENTS_JSON_PATH_FORMAT = "p/$post_slug/comments.json" +json_dump_kwargs = dict( + ensure_ascii=False, + indent=0, + separators=(',', ':'), +) + def count_images_in_markdown(md_content: str) -> int: """Count number of Substack CDN image URLs in markdown content.""" # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) @@ -483,7 +489,7 @@ def save_posts_data_json(self, posts_data: list) -> None: # sort by post_id, descending posts_data.sort(key=lambda p: -1*p["id"]) with open(posts_json_path, 'w', encoding='utf-8') as f: - json.dump(posts_data, f, ensure_ascii=False, separators=(',', ':')) + json.dump(posts_data, f, **json_dump_kwargs) async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ @@ -542,7 +548,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: output_directory, self.comments_json_path_template.substitute(self.format_vars) ) - _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + _json = json.dumps(comments_preloads, **json_dump_kwargs) self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: @@ -569,7 +575,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: output_directory, self.post_json_path_template.substitute(self.format_vars) ) - _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + _json = json.dumps(post_preloads, **json_dump_kwargs) self.save_to_file(json_filepath, _json) # Convert markdown to HTML and save @@ -610,7 +616,7 @@ def generate_main_html_file(self) -> None: posts_data = json.load(file) # Convert JSON data to a JSON string for embedding - embedded_json_data = json.dumps(posts_data, ensure_ascii=False, separators=(',', ':')) + embedded_json_data = json.dumps(posts_data, **json_dump_kwargs) html_output_path = os.path.join( self.format_vars["output_directory"], From 2ef534c6961c1be88df4f8d8bd14df1b467593f9 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:08:39 +0100 Subject: [PATCH 14/28] fix class SubstackScraper --- src/substack2markdown/substack_scraper.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py 
b/src/substack2markdown/substack_scraper.py index dd151b9e..835ca5bd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -101,6 +101,9 @@ async def __aenter__(self): return await self async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self): pass def __init__(self, args): @@ -706,10 +709,7 @@ async def process_markdown_images( class SubstackScraper(BaseSubstackScraper): - def __init__(self, args): - super().__init__(args, self.args.url) - - def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: + async def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ Gets soup from URL using requests """ @@ -764,9 +764,6 @@ async def _async_init(self): async def _start_driver(self): self.driver = await webdriver.Chrome(options=self.chrome_options) - async def __aexit__(self, exc_type, exc, tb): - await self.close() - async def close(self) -> None: if self.driver: await self.driver.quit() From ad84f46ddc3a9defc82e4a2d5f29b8c84534efb2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:09:07 +0100 Subject: [PATCH 15/28] add parameter offline --- src/substack2markdown/substack_scraper.py | 81 +++++++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 835ca5bd..45cbea05 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -140,11 +140,32 @@ def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. """ + if self.args.offline: + return self.get_all_post_urls_offline() urls = self.fetch_urls_from_sitemap() if not urls: urls = self.fetch_urls_from_feed() return self.filter_urls(urls, self.keywords) + def get_all_post_urls_offline(self) -> List[str]: + # Read JSON data + # NOTE this assumes that $post_slug is not used in args.output_directory_format + # because post_slug is undefined at this point + output_directory = self.output_directory_template.substitute(self.format_vars) + self.format_vars["output_directory"] = output_directory + posts_json_path = os.path.join( + # self.format_vars["output_directory"] = + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + urls = [] + for post in posts_data: + post["slug"] = post["html_link"].split("/")[-2] # FIXME remove + urls.append(self.args.url + "p/" + post["slug"]) + return urls + def fetch_urls_from_sitemap(self) -> List[str]: """ Fetches URLs from sitemap.xml. 
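The offline branch above rebuilds the post URL list from a previously saved posts.json instead of fetching sitemap.xml or feed.xml. A minimal standalone sketch of that recovery logic, assuming html_link values shaped like the default "p/$post_slug/index.html"; the sample data and base URL below are hypothetical:

import json

# Hypothetical posts.json content, as written by save_posts_data_json.
posts_data = json.loads('[{"id": 2, "html_link": "p/second-post/index.html"}, {"id": 1, "html_link": "p/first-post/index.html"}]')

base_url = "https://example.substack.com/"  # hypothetical publication URL
urls = []
for post in posts_data:
    # html_link ends in "p/<post_slug>/index.html", so the slug is the second-to-last path segment
    slug = post["html_link"].split("/")[-2]
    urls.append(base_url + "p/" + slug)

print(urls)
# ['https://example.substack.com/p/second-post', 'https://example.substack.com/p/first-post']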
@@ -352,6 +373,27 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + def extract_post_data_from_preloads(self, post_preloads): + + title = post_preloads["post"]["title"] + + subtitle = post_preloads["post"]["description"] + + like_count = post_preloads["post"]["reactions"]["❤"] + + # TODO expose date format + datetime_format = "%b %d, %Y" # "Oct 01, 2025" + + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" + date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) + + content_html = post_preloads["post"]["body_html"] + md = self.html_to_md(content_html) + # Combine metadata + content + md_content = self.combine_metadata_and_content(title, subtitle, date, like_count, md) + + return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): # all comments are stored in javascript # @@ -470,7 +512,7 @@ def render_comments_html_inner(comment, buf): return buf.getvalue() @abstractmethod - def get_url_soup(self, url: str) -> str: + async def get_url_soup(self, url: str) -> str: raise NotImplementedError def save_posts_data_json(self, posts_data: list) -> None: @@ -524,7 +566,15 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.format_vars["html_directory"] = os.path.dirname(html_filepath) # if not os.path.exists(md_filepath): - if True: + if self.args.offline: + json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) + with open(json_filepath) as f: + post_preloads = json.load(f) + title, subtitle, like_count, date, md = self.extract_post_data_from_preloads(post_preloads) + else: soup = await self.get_url_soup(url) if soup is None: total += 1 @@ -532,20 +582,31 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) + if True: post_id = post_preloads["post"]["id"] + if True: if not self.args.no_images: total_images = count_images_in_markdown(md) with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, img_pbar) + if True: comments_html = None comments_num = None if not self.args.no_comments: comments_url = url + "/comments" # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test - comments_soup = await self.get_url_soup(comments_url) - comments_preloads = await self.get_window_preloads(comments_soup) + if self.args.offline: + json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) + with open(json_filepath) as f: + comments_preloads = json.load(f) + else: + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) if not self.args.no_json: json_filepath = os.path.join( output_directory, @@ -587,6 +648,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: posts_data.append({ "id": post_id, + "slug": post_preloads["post"]["slug"], "title": title, "subtitle": subtitle, "like_count": like_count, @@ -696,7 +758,7 @@ async def process_markdown_images( output_directory, self.image_path_template.substitute(format_vars) )) - if not save_path.exists(): + if not save_path.exists() and not self.args.offline: await self.download_image(url, save_path, 
pbar) md_directory = self.format_vars["md_directory"] rel_path = save_path @@ -920,6 +982,11 @@ def parse_args() -> argparse.Namespace: default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", ) + parser.add_argument( + "--offline", # args.offline + action="store_true", + help="Use existing JSON files to render Markdown and HTML files.", + ) parser.add_argument( "-p", "--premium", @@ -1039,7 +1106,9 @@ async def async_main(): args.author_template = args.assets_dir + "/" + HTML_TEMPLATE if True: - if args.premium: + if args.offline: + scraper = await SubstackScraper(args) + elif args.premium: scraper = await PremiumSubstackScraper(args) else: scraper = await SubstackScraper(args) From 94192ca5dbc6131b1ff8f258c5c5e6b8c9cb7c6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 12:22:14 +0100 Subject: [PATCH 16/28] fix paths in posts.json --- src/substack2markdown/substack_scraper.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 45cbea05..341e4cb6 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -149,8 +149,6 @@ def get_all_post_urls(self) -> List[str]: def get_all_post_urls_offline(self) -> List[str]: # Read JSON data - # NOTE this assumes that $post_slug is not used in args.output_directory_format - # because post_slug is undefined at this point output_directory = self.output_directory_template.substitute(self.format_vars) self.format_vars["output_directory"] = output_directory posts_json_path = os.path.join( @@ -540,6 +538,15 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ + output_directory = self.output_directory_template.substitute(self.format_vars) + self.format_vars["output_directory"] = output_directory + + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + posts_json_dir = os.path.dirname(posts_json_path) + posts_data = [] count = 0 total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls) @@ -548,9 +555,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: post_slug = url.split("/")[-1] self.format_vars["post_slug"] = post_slug - output_directory = self.output_directory_template.substitute(self.format_vars) - self.format_vars["output_directory"] = output_directory - md_filepath = os.path.join( output_directory, self.md_path_template.substitute(self.format_vars) @@ -654,8 +658,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "like_count": like_count, "comment_count": comments_num, "date": date, - "file_link": md_filepath, - "html_link": html_filepath + "file_link": os.path.relpath(md_filepath, posts_json_dir), + "html_link": os.path.relpath(html_filepath, posts_json_dir), }) else: print(f"File already exists: {md_filepath}") From 6cccf75c955387be805f69324100d2e8a947cedd Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:02:48 +0100 Subject: [PATCH 17/28] add repost_count to posts_data --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 341e4cb6..7f3fa9ca 100644 --- a/src/substack2markdown/substack_scraper.py +++ 
b/src/substack2markdown/substack_scraper.py @@ -657,6 +657,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "subtitle": subtitle, "like_count": like_count, "comment_count": comments_num, + "repost_count": post_preloads["post"]["restacks"], "date": date, "file_link": os.path.relpath(md_filepath, posts_json_dir), "html_link": os.path.relpath(html_filepath, posts_json_dir), From d311b56756c102b0534eb8e94a3bae5b943321c6 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:19:25 +0100 Subject: [PATCH 18/28] add post_json to posts_data --- src/substack2markdown/substack_scraper.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 7f3fa9ca..c23e5a75 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -569,6 +569,18 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.format_vars["html_filepath"] = html_filepath self.format_vars["html_directory"] = os.path.dirname(html_filepath) + post_json_filepath = None + comments_json_filepath = None + if not self.args.no_json: + post_json_filepath = os.path.join( + output_directory, + self.post_json_path_template.substitute(self.format_vars) + ) + comments_json_filepath = os.path.join( + output_directory, + self.comments_json_path_template.substitute(self.format_vars) + ) + # if not os.path.exists(md_filepath): if self.args.offline: json_filepath = os.path.join( @@ -650,7 +662,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) - posts_data.append({ + post = { "id": post_id, "slug": post_preloads["post"]["slug"], "title": title, @@ -661,7 +673,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "date": date, "file_link": os.path.relpath(md_filepath, posts_json_dir), "html_link": os.path.relpath(html_filepath, posts_json_dir), - }) + } + + if not self.args.no_json: + post["post_json"] = os.path.relpath(post_json_filepath, posts_json_dir) + post["comments_json"] = os.path.relpath(comments_json_filepath, posts_json_dir) + + posts_data.append(post) else: print(f"File already exists: {md_filepath}") except Exception as e: From d7ff58243ef64e41982f96b7fc2c20d0672ff9ca Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:36:30 +0100 Subject: [PATCH 19/28] add generate_main_md_file --- src/substack2markdown/substack_scraper.py | 82 +++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index c23e5a75..64fed313 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -40,6 +40,7 @@ DEFAULT_IMAGE_PATH_FORMAT = "p/$post_slug/images/$image_filename" DEFAULT_MD_PATH_FORMAT = "p/$post_slug/readme.md" DEFAULT_HTML_PATH_FORMAT = "p/$post_slug/index.html" +DEFAULT_POSTS_MD_PATH_FORMAT = "readme.md" DEFAULT_POSTS_HTML_PATH_FORMAT = "index.html" DEFAULT_POSTS_JSON_PATH_FORMAT = "posts.json" DEFAULT_POST_JSON_PATH_FORMAT = "p/$post_slug/post.json" @@ -119,6 +120,7 @@ def __init__(self, args): self.md_path_template = string.Template(self.args.md_path_format) self.html_path_template = string.Template(self.args.html_path_format) self.image_path_template = string.Template(self.args.image_path_format) + self.posts_md_path_template = 
string.Template(self.args.posts_md_path_format) self.posts_html_path_template = string.Template(self.args.posts_html_path_format) self.posts_json_path_template = string.Template(self.args.posts_json_path_format) self.post_json_path_template = string.Template(self.args.post_json_path_format) @@ -689,8 +691,77 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_posts_data_json(posts_data) + self.generate_main_md_file() self.generate_main_html_file() + def generate_main_md_file(self) -> None: + """ + Generates a Markdown file for the given author. + """ + # Read JSON data + posts_json_path = os.path.join( + self.format_vars["output_directory"], + self.posts_json_path_template.substitute(self.format_vars) + ) + with open(posts_json_path, 'r', encoding='utf-8') as file: + posts_data = json.load(file) + + # sort by post_id, descending + posts_data.sort(key=lambda p: -1*p["id"]) + + last_post = posts_data[0] + last_post_json_path = last_post["post_json"] + last_post_json_path = os.path.join( + os.path.dirname(posts_json_path), + last_post_json_path + ) + + with open(last_post_json_path, 'r', encoding='utf-8') as file: + last_post = json.load(file) + + publication = last_post["pub"] + + md_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_md_path_template.substitute(self.format_vars) + ) + + with open(md_output_path, 'w', encoding='utf-8') as file: + file.write(f'# {publication["name"]}\n') + file.write('\n') + # author_url = f'https://substack.com/@{publication["author_handle"]}' # variable + author_url = f'https://substack.com/profile/{publication["author_id"]}' # constant + file.write(f'by [{publication["author_name"]}]({author_url})\n') + file.write('\n') + author_bio = publication["author_bio"].replace("\n", "\n\n") + file.write(f'{author_bio}\n') + file.write('\n') + file.write('\n') + file.write('\n') + file.write('## Posts\n') + file.write('\n') + for post in posts_data: + # TODO use args.datetime_format + post_date = post["date"] + post_link = ( + '<a href="' + post["file_link"] + '">' + + post["title"].replace('<', '&lt;') + + '</a>' + ) + if post["like_count"] > 0: + post_link += f" ❤" + str(post["like_count"]) # "❤123" + if post["comment_count"] > 0: + post_link += f" 🗨" + str(post["comment_count"]) # "🗨123" + if post["repost_count"] > 0: + post_link += f" ↻" + str(post["repost_count"]) # "↻123" + file.write(f'- {post_date} - {post_link}\n') def generate_main_html_file(self) -> None: """ Generates a HTML file for the given author. @@ -706,6 +777,11 @@ def generate_main_html_file(self) -> None: posts_data = json.load(file) # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(posts_data, **json_dump_kwargs) + md_output_path = os.path.join( + self.format_vars["output_directory"], + self.posts_md_path_template.substitute(self.format_vars) + ) + html_output_path = os.path.join( self.format_vars["output_directory"], self.posts_html_path_template.substitute(self.format_vars) @@ -1069,6 +1145,12 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_IMAGE_PATH_FORMAT, help=f"The file path format to save scraped image files. Default: {DEFAULT_IMAGE_PATH_FORMAT!r}", ) + parser.add_argument( + "--posts-md-path-format", # args.posts_md_path_format + type=str, + default=DEFAULT_POSTS_MD_PATH_FORMAT, + help=f"The file path format to save an index of scraped posts as Markdown file.
Default: {DEFAULT_POSTS_MD_PATH_FORMAT!r}", + ) parser.add_argument( "--posts-html-path-format", # args.posts_html_path_format type=str, From cbf9cbefab76db8329ef66f6af0d48d683049bcd Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:38:24 +0100 Subject: [PATCH 20/28] fix post links: remove ../ prefix --- src/substack2markdown/assets/js/populate-essays.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js index 0c700383..4bca7708 100644 --- a/src/substack2markdown/assets/js/populate-essays.js +++ b/src/substack2markdown/assets/js/populate-essays.js @@ -19,7 +19,7 @@ function populateEssays(data) { const essaysContainer = document.getElementById('essays-container'); const list = data.map(essay => `
-                <a href="../${essay.html_link}" target="_blank">${essay.title}</a> +                <a href="${essay.html_link}" target="_blank">${essay.title}</a>
    ${essay.subtitle}
From 8d24a803978309e025d52a814a3e7ef2b88f2718 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 13:40:44 +0100 Subject: [PATCH 21/28] fix post links: remove target="_blank" --- src/substack2markdown/assets/js/populate-essays.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js index 4bca7708..4c2249e1 100644 --- a/src/substack2markdown/assets/js/populate-essays.js +++ b/src/substack2markdown/assets/js/populate-essays.js @@ -19,7 +19,7 @@ function populateEssays(data) { const essaysContainer = document.getElementById('essays-container'); const list = data.map(essay => `
-                <a href="${essay.html_link}" target="_blank">${essay.title}</a> +                <a href="${essay.html_link}">${essay.title}</a>
    ${essay.subtitle}
From c138b87bcb6ee8b440a2c6ef8ca97bb3fd5b3b02 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 15:34:16 +0100 Subject: [PATCH 22/28] store ISO format date in posts.json --- src/substack2markdown/substack_scraper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 64fed313..18f6a79c 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -381,11 +381,10 @@ def extract_post_data_from_preloads(self, post_preloads): like_count = post_preloads["post"]["reactions"]["❤"] - # TODO expose date format - datetime_format = "%b %d, %Y" # "Oct 01, 2025" - date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" - date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" + + # datetime_format = "%b %d, %Y" # "Oct 01, 2025" + # date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime(datetime_format) content_html = post_preloads["post"]["body_html"] md = self.html_to_md(content_html) @@ -599,6 +598,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: continue title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) + date = post_preloads["post"]["post_date"] # date in ISO format: "2025-10-01T14:43:48.389Z" if True: post_id = post_preloads["post"]["id"] From 2acf823b707c49816d433a0159a06fc418e79a47 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 15:55:09 +0100 Subject: [PATCH 23/28] fix regex pattern in process_markdown_images --- src/substack2markdown/substack_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 18f6a79c..7954ddc6 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -842,12 +842,12 @@ async def process_markdown_images( """Process markdown content to download images and update references.""" output_directory = self.format_vars["output_directory"] # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) - pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + pattern = re.compile(r'\((https://substackcdn\.com/image/fetch/[^\s\)]+)\)') buf = io.StringIO() last_end = 0 for match in pattern.finditer(md_content): buf.write(md_content[last_end:match.start()]) - url = match.group(0).strip("()") + url = match.group(1) filename = sanitize_image_filename(url) format_vars = { **self.format_vars, From e5b0bdd738dcf88fe6a8598240dc1e92e55eefd7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 18:00:23 +0100 Subject: [PATCH 24/28] download images: add resolve_image_url --- src/substack2markdown/substack_scraper.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 7954ddc6..9e129131 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -83,6 +83,15 @@ def sanitize_image_filename(url: str) -> str: return filename +def resolve_image_url(url: str) -> str: + """Get the original image URL.""" + # https://substackcdn.com/image/fetch/xxx/https%3A%2F%2Fsubstack-post-media.s3.amazonaws.com%2Fpublic%2Fimages%2Fxxx + if
url.startswith("https://substackcdn.com/image/fetch/"): + # substackcdn.com returns a compressed version of the original image + url = "https://" + unquote(url.split("/https%3A%2F%2F")[1]) + return url + + def get_post_slug(url: str) -> str: match = re.search(r'/p/([^/]+)', url) return match.group(1) if match else 'unknown_post' @@ -848,6 +857,7 @@ async def process_markdown_images( for match in pattern.finditer(md_content): buf.write(md_content[last_end:match.start()]) url = match.group(1) + url = resolve_image_url(url) filename = sanitize_image_filename(url) format_vars = { **self.format_vars, From 0997609d1e67cefd1e9daf11014ced9ff07ca9bf Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Tue, 30 Dec 2025 18:09:40 +0100 Subject: [PATCH 25/28] fix type of like_count --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 9e129131..64a201d1 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -371,6 +371,7 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st if like_count_element and like_count_element.text.strip().isdigit() else "0" ) + like_count = int(like_count) # Post content content_element = soup.select_one("div.available-content") From 4ecb5e25f7a8e6fccb59dcec356d1a7ade27de33 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 17:48:43 +0100 Subject: [PATCH 26/28] fix url loop in scrape_posts --- src/substack2markdown/substack_scraper.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 64a201d1..adafad65 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -559,9 +559,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: posts_json_dir = os.path.dirname(posts_json_path) posts_data = [] - count = 0 - total = num_posts_to_scrape if num_posts_to_scrape != 0 else len(self.post_urls) - for url in tqdm(self.post_urls, total=total): + post_urls_slice = self.post_urls if num_posts_to_scrape == 0 else self.post_urls[:num_posts_to_scrape] + for url in tqdm(post_urls_slice): try: post_slug = url.split("/")[-1] self.format_vars["post_slug"] = post_slug @@ -604,7 +603,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: else: soup = await self.get_url_soup(url) if soup is None: - total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) post_preloads = await self.get_window_preloads(soup) @@ -697,9 +695,6 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: except Exception as e: print(f"Error scraping post: {e}") # raise e # debug - count += 1 - if num_posts_to_scrape != 0 and count == num_posts_to_scrape: - break self.save_posts_data_json(posts_data) self.generate_main_md_file() self.generate_main_html_file() From e5b0f177172dca4bb1cca4f3689cdafbb3436840 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 18:09:53 +0100 Subject: [PATCH 27/28] remove parameter offline --- src/substack2markdown/substack_scraper.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index adafad65..55f5299f 100644 --- a/src/substack2markdown/substack_scraper.py +++ 
b/src/substack2markdown/substack_scraper.py @@ -670,6 +670,10 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # Convert markdown to HTML and save html_content = self.md_to_html(md) + # if self.args.offline: + # html_content = post_preloads["post"]["body_html"] + # else: + # html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) post = { @@ -1087,11 +1091,16 @@ def parse_args() -> argparse.Namespace: default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", ) - parser.add_argument( - "--offline", # args.offline - action="store_true", - help="Use existing JSON files to render Markdown and HTML files.", - ) + # this was based on the wrong assumption + # that post_preloads JSON data contains the same body_html as the HTML page, but + # post_preloads["post"]["body_html"] contains HTML components with "data-attrs" attributes + # str(soup.select_one("div.available-content")) is clean HTML + # TODO convert HTML components to clean HTML + # parser.add_argument( + # "--offline", # args.offline + # action="store_true", + # help="Use existing JSON files to render Markdown and HTML files.", + # ) parser.add_argument( "-p", "--premium", @@ -1203,6 +1212,8 @@ def parse_args() -> argparse.Namespace: async def async_main(): args = parse_args() + args.offline = False + if args.config: with open(args.config) as f: config = json.load(f) From 55b7e915d2f8c1e1262b692e334c2149eaad739d Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Wed, 31 Dec 2025 18:28:49 +0100 Subject: [PATCH 28/28] add process_markdown_links --- src/substack2markdown/substack_scraper.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 55f5299f..0bc32838 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -617,6 +617,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, img_pbar) + md = self.process_markdown_links(md) + if True: comments_html = None comments_num = None @@ -878,6 +880,25 @@ async def process_markdown_images( buf.write(md_content[last_end:]) return buf.getvalue() + def process_markdown_links(self, md_content): + # patch links to other posts of this publication + pattern = re.compile(r'\]\(https://' + self.publication_handle + r'\.substack\.com/p/([^\s\)]+)\)') + md_directory = self.format_vars["md_directory"] + output_directory = self.format_vars["output_directory"] + def get_replacement(match): + post_slug = match.group(1) + md_filepath = os.path.join( + output_directory, + self.md_path_template.substitute({ + **self.format_vars, + "post_slug": post_slug, + }) + ) + md_filepath_rel = os.path.relpath(md_filepath, md_directory) + return '](' + md_filepath_rel + ')' + md_content = re.sub(pattern, get_replacement, md_content) + return md_content + class SubstackScraper(BaseSubstackScraper): async def get_url_soup(self, url: str) -> Optional[BeautifulSoup]:
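A minimal standalone sketch of the link rewriting added in this last patch, assuming the default md path format "p/$post_slug/readme.md"; the publication handle, directories, and sample markdown below are hypothetical (re.escape is added here as a small hardening; the patch concatenates the handle into the pattern as-is):

import os
import re
import string

publication_handle = "example"  # hypothetical publication handle
md_path_template = string.Template("p/$post_slug/readme.md")
output_directory = "output/example"  # hypothetical output root
md_directory = "output/example/p/current-post"  # directory of the markdown file being rewritten

md_content = "see [part one](https://example.substack.com/p/part-one) for details"

# Turn absolute links to this publication's posts into relative links
# between the local markdown files.
pattern = re.compile(r'\]\(https://' + re.escape(publication_handle) + r'\.substack\.com/p/([^\s\)]+)\)')

def get_replacement(match):
    post_slug = match.group(1)
    md_filepath = os.path.join(output_directory, md_path_template.substitute(post_slug=post_slug))
    return '](' + os.path.relpath(md_filepath, md_directory) + ')'

print(pattern.sub(get_replacement, md_content))
# see [part one](../part-one/readme.md) for details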