diff --git a/CLAUDE.md b/CLAUDE.md index 1dd5ed3..cf8a4a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,16 +21,16 @@ icons/ → Extension icons (16, 48, 128px) ## Architecture **Two Trigger Methods:** -- Popup button → `popup.js` injects libs + calls `extractContent()` + `generateFilename()` +- Popup button → `popup.js` injects libs + calls `extractContent()` + `generateBaseName()` - Right-click menu → `background.js` injects libs + `content-script.js` **Data Flow:** 1. Inject `lib/*.js` scripts into page context -2. `extractContent()` uses Readability to extract article content +2. `extractContent()` uses Readability to extract article content (includes pre-analyzed images) 3. Turndown converts HTML to Markdown (with GFM tables/strikethrough) -4. `generateFilename()` creates safe filename from title + date -5. Images embedded as `![alt](url)` - no local download -6. Markdown saved via `window.showSaveFilePicker()` +4. `generateBaseName()` creates safe base name from title + date +5. Images embedded as `![alt](url)` or downloaded locally (experimental setting) +6. Markdown saved via Chrome downloads API to `Downloads/EzyCopy/` ## Key Libraries diff --git a/content-script.js b/content-script.js index 6fc3702..b1737af 100644 --- a/content-script.js +++ b/content-script.js @@ -1,6 +1,7 @@ -// This script is injected after the lib scripts (readability, turndown, ezycopy, platform) -// extractContent(), formatContent(), generateFilename(), generateSubfolder(), -// extractImagesFromHtml(), and rewriteImagePaths() are available from lib/ +// This script is injected after the lib scripts (readability, turndown, ezycopy) +// extractContent(), formatContent(), generateBaseName() are global functions from lib/ezycopy.js +// extraction.images contains pre-analyzed image data (no need for separate extraction call) +// EzyCopyFiles.rewriteImagePaths() is available from file-helpers.js /** * Build success message based on actions performed @@ -55,8 +56,8 @@ async function downloadMarkdownFile(content, filename) { const { copyToClipboard: shouldCopy, downloadMarkdown: shouldDownload, includeImages } = settings; const { selectiveCopy, downloadImagesLocally } = settings.experimental || {}; - // Extract content (respects selectiveCopy setting) - const extraction = extractContent({ selectiveCopy }); + // Extract content (respects selectiveCopy and includeImages settings) + const extraction = extractContent({ selectiveCopy, includeImages }); let clipboardContent = null; let downloadContent = null; @@ -72,27 +73,26 @@ async function downloadMarkdownFile(content, filename) { downloadContent = formatContent(extraction, 'download', settings); // Handle image downloads if all conditions met - if (includeImages && downloadImagesLocally && extraction.html) { - const images = extractImagesFromHtml(extraction.html); - if (images.length > 0) { - showFeedback(`Downloading ${images.length} images...`, "#2196F3"); - - const subfolder = generateSubfolder(extraction.title); - const imageResult = await chrome.runtime.sendMessage({ - action: 'downloadImages', - images: images, - subfolder: subfolder - }); - - if (imageResult.downloadedCount > 0) { - downloadContent = rewriteImagePaths(downloadContent, imageResult.urlToPathMap); - imageCount = imageResult.downloadedCount; - } + // Images are pre-analyzed in extraction result (single-pass optimization) + if (includeImages && downloadImagesLocally && extraction.images?.length > 0) { + const images = extraction.images; + showFeedback(`Downloading ${images.length} images...`, "#2196F3"); + + const subfolder = generateBaseName(extraction.title); + const imageResult = await chrome.runtime.sendMessage({ + action: 'downloadImages', + images: images, + subfolder: subfolder + }); + + if (imageResult.downloadedCount > 0) { + downloadContent = EzyCopyFiles.rewriteImagePaths(downloadContent, imageResult.urlToPathMap); + imageCount = imageResult.downloadedCount; } } } - const filename = generateFilename(extraction.title); + const filename = generateBaseName(extraction.title) + '.md'; // Execute outputs let copiedToClipboard = false; diff --git a/file-helpers.js b/file-helpers.js index 6dfde8f..0de9b10 100644 --- a/file-helpers.js +++ b/file-helpers.js @@ -43,16 +43,6 @@ } } - function generatePageSubfolder(pageTitle) { - const safeTitle = pageTitle - .substring(0, 50) - .replace(/[^a-zA-Z0-9]/g, '-') - .replace(/-+/g, '-') - .replace(/^-|-$/g, ''); - const timestamp = new Date().toISOString().slice(0, 10); - return `${safeTitle}-${timestamp}`; - } - function rewriteImagePaths(markdown, urlToPathMap) { let result = markdown; @@ -71,7 +61,6 @@ getBasePath, getImagesPath, sanitizeImageFilename, - generatePageSubfolder, rewriteImagePaths, }; })(typeof self !== 'undefined' ? self : this); diff --git a/injection-files.js b/injection-files.js index 0537d75..b7c9e42 100644 --- a/injection-files.js +++ b/injection-files.js @@ -8,7 +8,6 @@ "lib/turndown-plugin-gfm.js", "file-helpers.js", "lib/ezycopy.js", - "lib/platform.js", "content-script.js", ]; diff --git a/lib/ezycopy.js b/lib/ezycopy.js index ee8044b..9cdadc4 100644 --- a/lib/ezycopy.js +++ b/lib/ezycopy.js @@ -55,14 +55,68 @@ function stripImages(markdown) { } /** - * Count images in an HTML string + * Analyze images in an HTML string - returns both count and image data + * Single-pass extraction to avoid redundant HTML parsing * @param {string} html - HTML content to parse - * @returns {number} Number of img elements found + * @returns {{count: number, images: Array<{src: string, alt: string}>}} Image analysis result */ -function countImagesInHtml(html) { +function analyzeImagesInHtml(html) { const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); - return doc.querySelectorAll('img').length; + const imgElements = doc.querySelectorAll('img'); + + const images = []; + const seenUrls = new Set(); + + imgElements.forEach((img, index) => { + const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'); + if (!src || src.startsWith('data:') || seenUrls.has(src)) return; + + seenUrls.add(src); + + // Resolve relative URLs to absolute + let absoluteSrc = src; + try { + absoluteSrc = new URL(src, location.href).href; + } catch (e) { + // Keep original if URL parsing fails + } + + images.push({ + src: absoluteSrc, + alt: img.alt || `image-${index}` + }); + }); + + return { count: images.length, images }; +} + +/** + * Calculate text overlap between Readability output and a container + * Used to identify which CMS container matches the extracted article content + * @param {string} readabilityText - Text content from Readability + * @param {string} containerText - Text content from a CMS container + * @returns {number} Overlap ratio between 0 and 1 + */ +function calculateTextOverlap(readabilityText, containerText) { + if (!readabilityText || readabilityText.length === 0) return 0; + + // Normalize: lowercase, collapse whitespace + const normalize = (str) => str.toLowerCase().replace(/\s+/g, ' ').trim(); + const normalizedReadability = normalize(readabilityText); + const normalizedContainer = normalize(containerText); + + // Check if one contains the other (handles Readability stripping or container having extra) + if (normalizedContainer.includes(normalizedReadability)) return 1.0; + if (normalizedReadability.includes(normalizedContainer)) { + return normalizedContainer.length / normalizedReadability.length; + } + + // Word-based overlap for partial matches (filter words > 3 chars to avoid noise) + const words = normalizedReadability.split(' ').filter(w => w.length > 3); + if (words.length === 0) return 0; + const matchedWords = words.filter(w => normalizedContainer.includes(w)); + return matchedWords.length / words.length; } /** @@ -97,30 +151,52 @@ function cleanContainerHtml(container) { } /** - * Find the best content container for fallback extraction - * Tries common CMS content selectors before falling back to body - * @returns {Element} The content container element + * Find the best content container for fallback extraction using hybrid approach + * Uses text overlap with Readability output as primary signal, image count as tiebreaker + * @param {string} readabilityText - Text content extracted by Readability (used as "fingerprint") + * @returns {Element|null} The best matching container, or null if none found */ -function findContentContainer() { - // Only use CMS-specific content containers that reliably contain ONLY article content - // Avoid generic elements like 'main', 'article' which often include sidebars/related content +function findBestContentContainer(readabilityText) { + // CMS-specific selectors only - avoid generic main/article which include sidebars const selectors = [ - '.rte', // Shopify - '.entry-content', // WordPress - '.post-content', // WordPress alt - '.article-content', // Common CMS pattern - '.post-body', // Blogger/Tumblr - '.story-body', // News sites - '.article-body', // News sites alt + '.rte', // Shopify + '.entry-content', // WordPress + '.post-content', // WordPress alt + '.article-content', // Common CMS + '.post-body', // Blogger/Tumblr + '.story-body', // News sites + '.article-body', // News sites alt ]; - for (const selector of selectors) { - const el = document.querySelector(selector); - if (el && el.querySelectorAll('img').length > 0) { - return el; - } + // Collect ALL matching containers across all selectors + const candidates = []; + selectors.forEach(selector => { + document.querySelectorAll(selector).forEach(el => { + if (el.querySelectorAll('img').length > 0) { + candidates.push({ + element: el, + selector, + textOverlap: calculateTextOverlap(readabilityText, el.textContent), + imageCount: el.querySelectorAll('img').length + }); + } + }); + }); + + if (candidates.length === 0) return null; + + // Filter to candidates with >50% text overlap + const goodMatches = candidates.filter(c => c.textOverlap > 0.5); + + if (goodMatches.length === 0) { + // No good text match - fall back to container with most images + candidates.sort((a, b) => b.imageCount - a.imageCount); + return candidates[0].element; } - return document.body; + + // Among good text matches, pick the one with most images (tiebreaker) + goodMatches.sort((a, b) => b.imageCount - a.imageCount); + return goodMatches[0].element; } /** @@ -178,23 +254,26 @@ function formatContent(extraction, target, settings) { * Main extraction function - returns structured data for formatting * @param {Object} options - Extraction options * @param {boolean} options.selectiveCopy - Whether to check for text selection + * @param {boolean} options.includeImages - Whether images are enabled (skip fallback if false) * @returns {Object} Structured extraction result */ function extractContent(options = {}) { - const { selectiveCopy = true } = options; + const { selectiveCopy = true, includeImages = true } = options; const turndown = createTurndownService(); // Check for text selection (only if selectiveCopy enabled) if (selectiveCopy) { const selectionHtml = getSelectionHtml(); if (selectionHtml) { + const { images } = analyzeImagesInHtml(selectionHtml); return { title: document.title, body: turndown.turndown(selectionHtml), sourceUrl: location.href, byline: null, isSelection: true, - html: selectionHtml + html: selectionHtml, + images }; } } @@ -206,74 +285,91 @@ function extractContent(options = {}) { if (!article) { // Fallback: convert body directly if Readability can't parse + const bodyHtml = document.body.outerHTML; + const { images } = analyzeImagesInHtml(bodyHtml); return { title: document.title, body: turndown.turndown(document.body), sourceUrl: location.href, byline: null, isSelection: false, - html: document.body.outerHTML + html: bodyHtml, + images }; } - // Check for significant image loss from Readability processing - // Readability strips images when text-to-image ratio is low (gallery pages) - // Only fallback if we find a specific content container (not body) - const container = findContentContainer(); - const isSpecificContainer = container !== document.body; - - if (isSpecificContainer) { - const containerImageCount = container.querySelectorAll('img').length; - const readabilityImageCount = countImagesInHtml(article.content); - const imageLossRatio = containerImageCount > 0 - ? (containerImageCount - readabilityImageCount) / containerImageCount - : 0; - - // If >50% images lost from content container, use fallback to preserve images - if (imageLossRatio > 0.5) { - const cleanedHtml = cleanContainerHtml(container); - return { - title: article.title, - body: turndown.turndown(cleanedHtml), - sourceUrl: location.href, - byline: article.byline, - isSelection: false, - html: cleanedHtml - }; - } + // EARLY RETURN: Skip fallback logic if images are disabled (optimization) + if (!includeImages) { + return { + title: article.title, + body: turndown.turndown(article.content), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: article.content, + images: [] // Skip image analysis when images are disabled + }; } + // Find best container using text matching + image count (hybrid approach) + const readabilityText = article.textContent || ''; + const container = findBestContentContainer(readabilityText); + + // If no CMS container found, use Readability output + if (!container) { + const { images } = analyzeImagesInHtml(article.content); + return { + title: article.title, + body: turndown.turndown(article.content), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: article.content, + images + }; + } + + // Check image loss in selected container + const containerImageCount = container.querySelectorAll('img').length; + const readabilityAnalysis = analyzeImagesInHtml(article.content); + const imageLossRatio = containerImageCount > 0 + ? (containerImageCount - readabilityAnalysis.count) / containerImageCount + : 0; + + // If >50% images lost, use cleaned container to preserve images + if (imageLossRatio > 0.5) { + const cleanedHtml = cleanContainerHtml(container); + const { images } = analyzeImagesInHtml(cleanedHtml); + return { + title: article.title, + body: turndown.turndown(cleanedHtml), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: cleanedHtml, + images + }; + } + + // Default: use Readability output (reuse already-analyzed images) return { title: article.title, body: turndown.turndown(article.content), sourceUrl: location.href, byline: article.byline, isSelection: false, - html: article.content + html: article.content, + images: readabilityAnalysis.images }; } /** - * Generate a safe filename from page title + * Generate a safe base name from page title (without extension) + * Used for both markdown filenames and image subfolders * @param {string} title - Page or article title - * @returns {string} Safe filename with .md extension + * @returns {string} Safe base name in format "title-YYYY-MM-DD" */ -function generateFilename(title) { - const safeTitle = title - .substring(0, 50) - .replace(/[^a-zA-Z0-9]/g, '-') - .replace(/-+/g, '-') - .replace(/^-|-$/g, ''); - const timestamp = new Date().toISOString().slice(0, 10); - return `${safeTitle}-${timestamp}.md`; -} - -/** - * Generate subfolder name for images - * @param {string} title - Page or article title - * @returns {string} Safe subfolder name - */ -function generateSubfolder(title) { +function generateBaseName(title) { const safeTitle = title .substring(0, 50) .replace(/[^a-zA-Z0-9]/g, '-') @@ -283,48 +379,4 @@ function generateSubfolder(title) { return `${safeTitle}-${timestamp}`; } -/** - * Extract all images from the article content - * Uses DOMParser for safe HTML parsing (no script execution) - * @param {string} articleHtml - HTML content from Readability - * @returns {Array<{src: string, alt: string}>} Array of image objects - */ -function extractImagesFromHtml(articleHtml) { - const images = []; - const seenUrls = new Set(); - - // DOMParser is safe - it doesn't execute scripts or load external resources - const parser = new DOMParser(); - const doc = parser.parseFromString(articleHtml, 'text/html'); - - const imgElements = doc.querySelectorAll('img'); - - imgElements.forEach((img, index) => { - const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'); - - if (!src) return; - - // Skip data URIs (already embedded) - if (src.startsWith('data:')) return; - - // Skip duplicates - if (seenUrls.has(src)) return; - seenUrls.add(src); - - // Resolve relative URLs to absolute - let absoluteSrc = src; - try { - absoluteSrc = new URL(src, location.href).href; - } catch (e) { - // Keep original if URL parsing fails - } - - images.push({ - src: absoluteSrc, - alt: img.alt || `image-${index}` - }); - }); - - return images; -} diff --git a/lib/platform.js b/lib/platform.js deleted file mode 100644 index 1b5a0ed..0000000 --- a/lib/platform.js +++ /dev/null @@ -1,36 +0,0 @@ -(function () { - // Prevent redeclaration when scripts are injected multiple times - if (window.EzyCopyPlatform) return; - - // Expect shared helpers to be loaded first - if (!window.EzyCopyFiles) { - console.error('EzyCopyFiles not loaded; platform helpers unavailable'); - return; - } - - const { - EZYCOPY_FOLDER, - IMAGES_SUBFOLDER, - getBasePath: getEzyCopyBasePath, - getImagesPath, - sanitizeImageFilename, - generatePageSubfolder, - rewriteImagePaths, - } = window.EzyCopyFiles; - - // Expose API once to avoid polluting global scope with redeclarable consts - window.EzyCopyPlatform = { - EZYCOPY_FOLDER, - IMAGES_SUBFOLDER, - getEzyCopyBasePath, - getImagesPath, - sanitizeImageFilename, - generatePageSubfolder, - rewriteImagePaths, - }; - - // Backwards compatibility for existing global calls - window.rewriteImagePaths = rewriteImagePaths; - window.generatePageSubfolder = generatePageSubfolder; - window.sanitizeImageFilename = sanitizeImageFilename; -})(); diff --git a/popup.js b/popup.js index 9e424cb..52bdedf 100644 --- a/popup.js +++ b/popup.js @@ -49,7 +49,8 @@ document.addEventListener("DOMContentLoaded", async function () { currentSettings.copyToClipboard = e.target.checked; currentSettings = enforceAtLeastOneActive(currentSettings, 'copyToClipboard'); await saveSettings(currentSettings); - // Sync UI if enforcement changed the other toggle + // Sync both toggles in case enforcement changed either + copyToClipboardToggle.checked = currentSettings.copyToClipboard; downloadMarkdownToggle.checked = currentSettings.downloadMarkdown; updateDownloadImagesVisibility(currentSettings.downloadMarkdown, currentSettings.includeImages); }); @@ -60,8 +61,9 @@ document.addEventListener("DOMContentLoaded", async function () { currentSettings.downloadMarkdown = e.target.checked; currentSettings = enforceAtLeastOneActive(currentSettings, 'downloadMarkdown'); await saveSettings(currentSettings); - // Sync UI if enforcement changed the other toggle + // Sync both toggles in case enforcement changed either copyToClipboardToggle.checked = currentSettings.copyToClipboard; + downloadMarkdownToggle.checked = currentSettings.downloadMarkdown; updateDownloadImagesVisibility(currentSettings.downloadMarkdown, currentSettings.includeImages); });