From 3287e59a34b158aa07d4b30df418f3ced60f32b7 Mon Sep 17 00:00:00 2001 From: gupsammy Date: Sat, 29 Nov 2025 13:58:52 +0530 Subject: [PATCH 1/3] refactor(extraction): improve image preservation with text-overlap matching - Add calculateTextOverlap() to match Readability output fingerprints against CMS containers - Refactor findBestContentContainer() to use hybrid approach: text overlap + image count tiebreaker - Improves container detection on multi-column and gallery-heavy pages by better distinguishing article containers from sidebars - Add includeImages option to extractContent() and skip fallback when disabled for performance - Update content-script.js to pass includeImages setting through extraction pipeline --- content-script.js | 4 +- lib/ezycopy.js | 163 +++++++++++++++++++++++++++++++++------------- 2 files changed, 120 insertions(+), 47 deletions(-) diff --git a/content-script.js b/content-script.js index 6fc3702..1bed26c 100644 --- a/content-script.js +++ b/content-script.js @@ -55,8 +55,8 @@ async function downloadMarkdownFile(content, filename) { const { copyToClipboard: shouldCopy, downloadMarkdown: shouldDownload, includeImages } = settings; const { selectiveCopy, downloadImagesLocally } = settings.experimental || {}; - // Extract content (respects selectiveCopy setting) - const extraction = extractContent({ selectiveCopy }); + // Extract content (respects selectiveCopy and includeImages settings) + const extraction = extractContent({ selectiveCopy, includeImages }); let clipboardContent = null; let downloadContent = null; diff --git a/lib/ezycopy.js b/lib/ezycopy.js index ee8044b..465ae73 100644 --- a/lib/ezycopy.js +++ b/lib/ezycopy.js @@ -65,6 +65,34 @@ function countImagesInHtml(html) { return doc.querySelectorAll('img').length; } +/** + * Calculate text overlap between Readability output and a container + * Used to identify which CMS container matches the extracted article content + * @param {string} readabilityText - Text content from Readability + * @param {string} containerText - Text content from a CMS container + * @returns {number} Overlap ratio between 0 and 1 + */ +function calculateTextOverlap(readabilityText, containerText) { + if (!readabilityText || readabilityText.length === 0) return 0; + + // Normalize: lowercase, collapse whitespace + const normalize = (str) => str.toLowerCase().replace(/\s+/g, ' ').trim(); + const normalizedReadability = normalize(readabilityText); + const normalizedContainer = normalize(containerText); + + // Check if one contains the other (handles Readability stripping or container having extra) + if (normalizedContainer.includes(normalizedReadability)) return 1.0; + if (normalizedReadability.includes(normalizedContainer)) { + return normalizedContainer.length / normalizedReadability.length; + } + + // Word-based overlap for partial matches (filter words > 3 chars to avoid noise) + const words = normalizedReadability.split(' ').filter(w => w.length > 3); + if (words.length === 0) return 0; + const matchedWords = words.filter(w => normalizedContainer.includes(w)); + return matchedWords.length / words.length; +} + /** * Clean HTML by removing non-content elements (styles, scripts, promos) * @param {Element} container - The container element to clean @@ -97,30 +125,52 @@ function cleanContainerHtml(container) { } /** - * Find the best content container for fallback extraction - * Tries common CMS content selectors before falling back to body - * @returns {Element} The content container element + * Find the best content container for fallback extraction using hybrid approach + * Uses text overlap with Readability output as primary signal, image count as tiebreaker + * @param {string} readabilityText - Text content extracted by Readability (used as "fingerprint") + * @returns {Element|null} The best matching container, or null if none found */ -function findContentContainer() { - // Only use CMS-specific content containers that reliably contain ONLY article content - // Avoid generic elements like 'main', 'article' which often include sidebars/related content +function findBestContentContainer(readabilityText) { + // CMS-specific selectors only - avoid generic main/article which include sidebars const selectors = [ - '.rte', // Shopify - '.entry-content', // WordPress - '.post-content', // WordPress alt - '.article-content', // Common CMS pattern - '.post-body', // Blogger/Tumblr - '.story-body', // News sites - '.article-body', // News sites alt + '.rte', // Shopify + '.entry-content', // WordPress + '.post-content', // WordPress alt + '.article-content', // Common CMS + '.post-body', // Blogger/Tumblr + '.story-body', // News sites + '.article-body', // News sites alt ]; - for (const selector of selectors) { - const el = document.querySelector(selector); - if (el && el.querySelectorAll('img').length > 0) { - return el; - } + // Collect ALL matching containers across all selectors + const candidates = []; + selectors.forEach(selector => { + document.querySelectorAll(selector).forEach(el => { + if (el.querySelectorAll('img').length > 0) { + candidates.push({ + element: el, + selector, + textOverlap: calculateTextOverlap(readabilityText, el.textContent), + imageCount: el.querySelectorAll('img').length + }); + } + }); + }); + + if (candidates.length === 0) return null; + + // Filter to candidates with >50% text overlap + const goodMatches = candidates.filter(c => c.textOverlap > 0.5); + + if (goodMatches.length === 0) { + // No good text match - fall back to container with most images + candidates.sort((a, b) => b.imageCount - a.imageCount); + return candidates[0].element; } - return document.body; + + // Among good text matches, pick the one with most images (tiebreaker) + goodMatches.sort((a, b) => b.imageCount - a.imageCount); + return goodMatches[0].element; } /** @@ -178,10 +228,11 @@ function formatContent(extraction, target, settings) { * Main extraction function - returns structured data for formatting * @param {Object} options - Extraction options * @param {boolean} options.selectiveCopy - Whether to check for text selection + * @param {boolean} options.includeImages - Whether images are enabled (skip fallback if false) * @returns {Object} Structured extraction result */ function extractContent(options = {}) { - const { selectiveCopy = true } = options; + const { selectiveCopy = true, includeImages = true } = options; const turndown = createTurndownService(); // Check for text selection (only if selectiveCopy enabled) @@ -216,33 +267,55 @@ function extractContent(options = {}) { }; } - // Check for significant image loss from Readability processing - // Readability strips images when text-to-image ratio is low (gallery pages) - // Only fallback if we find a specific content container (not body) - const container = findContentContainer(); - const isSpecificContainer = container !== document.body; - - if (isSpecificContainer) { - const containerImageCount = container.querySelectorAll('img').length; - const readabilityImageCount = countImagesInHtml(article.content); - const imageLossRatio = containerImageCount > 0 - ? (containerImageCount - readabilityImageCount) / containerImageCount - : 0; - - // If >50% images lost from content container, use fallback to preserve images - if (imageLossRatio > 0.5) { - const cleanedHtml = cleanContainerHtml(container); - return { - title: article.title, - body: turndown.turndown(cleanedHtml), - sourceUrl: location.href, - byline: article.byline, - isSelection: false, - html: cleanedHtml - }; - } + // EARLY RETURN: Skip fallback logic if images are disabled (optimization) + if (!includeImages) { + return { + title: article.title, + body: turndown.turndown(article.content), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: article.content + }; + } + + // Find best container using text matching + image count (hybrid approach) + const readabilityText = article.textContent || ''; + const container = findBestContentContainer(readabilityText); + + // If no CMS container found, use Readability output + if (!container) { + return { + title: article.title, + body: turndown.turndown(article.content), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: article.content + }; + } + + // Check image loss in selected container + const containerImageCount = container.querySelectorAll('img').length; + const readabilityImageCount = countImagesInHtml(article.content); + const imageLossRatio = containerImageCount > 0 + ? (containerImageCount - readabilityImageCount) / containerImageCount + : 0; + + // If >50% images lost, use cleaned container to preserve images + if (imageLossRatio > 0.5) { + const cleanedHtml = cleanContainerHtml(container); + return { + title: article.title, + body: turndown.turndown(cleanedHtml), + sourceUrl: location.href, + byline: article.byline, + isSelection: false, + html: cleanedHtml + }; } + // Default: use Readability output return { title: article.title, body: turndown.turndown(article.content), From fe9a588a42f83a6d7426057ac865148b28945da0 Mon Sep 17 00:00:00 2001 From: gupsammy Date: Sat, 29 Nov 2025 19:13:52 +0530 Subject: [PATCH 2/3] refactor: optimize image handling and simplify module architecture Consolidate image-related functions and eliminate unnecessary module layers: - Replace separate extractImagesFromHtml() and countImagesInHtml() with unified analyzeImagesInHtml() for single-pass extraction - Move image analysis into extractContent() to pre-populate images in extraction result (eliminates redundant parsing) - Unify generateFilename() and generateSubfolder() into single generateBaseName() function - Remove redundant platform.js wrapper layer, use EzyCopyFiles directly in content-script.js - Update content-script.js to consume pre-analyzed images from extraction result instead of extracting separately - Remove generatePageSubfolder() from file-helpers.js (replaced by generateBaseName) - Remove lib/platform.js from injection list - Update documentation to reflect simplified architecture - Fix popup.js toggle synchronization to handle both checkboxes This reduces code complexity, eliminates redundant DOM parsing, and streamlines the data flow. --- CLAUDE.md | 10 ++-- content-script.js | 40 +++++++------- file-helpers.js | 11 ---- injection-files.js | 1 - lib/ezycopy.js | 129 +++++++++++++++++++-------------------------- lib/platform.js | 36 ------------- popup.js | 6 ++- 7 files changed, 83 insertions(+), 150 deletions(-) delete mode 100644 lib/platform.js diff --git a/CLAUDE.md b/CLAUDE.md index 1dd5ed3..cf8a4a9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,16 +21,16 @@ icons/ → Extension icons (16, 48, 128px) ## Architecture **Two Trigger Methods:** -- Popup button → `popup.js` injects libs + calls `extractContent()` + `generateFilename()` +- Popup button → `popup.js` injects libs + calls `extractContent()` + `generateBaseName()` - Right-click menu → `background.js` injects libs + `content-script.js` **Data Flow:** 1. Inject `lib/*.js` scripts into page context -2. `extractContent()` uses Readability to extract article content +2. `extractContent()` uses Readability to extract article content (includes pre-analyzed images) 3. Turndown converts HTML to Markdown (with GFM tables/strikethrough) -4. `generateFilename()` creates safe filename from title + date -5. Images embedded as `![alt](url)` - no local download -6. Markdown saved via `window.showSaveFilePicker()` +4. `generateBaseName()` creates safe base name from title + date +5. Images embedded as `![alt](url)` or downloaded locally (experimental setting) +6. Markdown saved via Chrome downloads API to `Downloads/EzyCopy/` ## Key Libraries diff --git a/content-script.js b/content-script.js index 1bed26c..b1737af 100644 --- a/content-script.js +++ b/content-script.js @@ -1,6 +1,7 @@ -// This script is injected after the lib scripts (readability, turndown, ezycopy, platform) -// extractContent(), formatContent(), generateFilename(), generateSubfolder(), -// extractImagesFromHtml(), and rewriteImagePaths() are available from lib/ +// This script is injected after the lib scripts (readability, turndown, ezycopy) +// extractContent(), formatContent(), generateBaseName() are global functions from lib/ezycopy.js +// extraction.images contains pre-analyzed image data (no need for separate extraction call) +// EzyCopyFiles.rewriteImagePaths() is available from file-helpers.js /** * Build success message based on actions performed @@ -72,27 +73,26 @@ async function downloadMarkdownFile(content, filename) { downloadContent = formatContent(extraction, 'download', settings); // Handle image downloads if all conditions met - if (includeImages && downloadImagesLocally && extraction.html) { - const images = extractImagesFromHtml(extraction.html); - if (images.length > 0) { - showFeedback(`Downloading ${images.length} images...`, "#2196F3"); - - const subfolder = generateSubfolder(extraction.title); - const imageResult = await chrome.runtime.sendMessage({ - action: 'downloadImages', - images: images, - subfolder: subfolder - }); - - if (imageResult.downloadedCount > 0) { - downloadContent = rewriteImagePaths(downloadContent, imageResult.urlToPathMap); - imageCount = imageResult.downloadedCount; - } + // Images are pre-analyzed in extraction result (single-pass optimization) + if (includeImages && downloadImagesLocally && extraction.images?.length > 0) { + const images = extraction.images; + showFeedback(`Downloading ${images.length} images...`, "#2196F3"); + + const subfolder = generateBaseName(extraction.title); + const imageResult = await chrome.runtime.sendMessage({ + action: 'downloadImages', + images: images, + subfolder: subfolder + }); + + if (imageResult.downloadedCount > 0) { + downloadContent = EzyCopyFiles.rewriteImagePaths(downloadContent, imageResult.urlToPathMap); + imageCount = imageResult.downloadedCount; } } } - const filename = generateFilename(extraction.title); + const filename = generateBaseName(extraction.title) + '.md'; // Execute outputs let copiedToClipboard = false; diff --git a/file-helpers.js b/file-helpers.js index 6dfde8f..0de9b10 100644 --- a/file-helpers.js +++ b/file-helpers.js @@ -43,16 +43,6 @@ } } - function generatePageSubfolder(pageTitle) { - const safeTitle = pageTitle - .substring(0, 50) - .replace(/[^a-zA-Z0-9]/g, '-') - .replace(/-+/g, '-') - .replace(/^-|-$/g, ''); - const timestamp = new Date().toISOString().slice(0, 10); - return `${safeTitle}-${timestamp}`; - } - function rewriteImagePaths(markdown, urlToPathMap) { let result = markdown; @@ -71,7 +61,6 @@ getBasePath, getImagesPath, sanitizeImageFilename, - generatePageSubfolder, rewriteImagePaths, }; })(typeof self !== 'undefined' ? self : this); diff --git a/injection-files.js b/injection-files.js index 0537d75..b7c9e42 100644 --- a/injection-files.js +++ b/injection-files.js @@ -8,7 +8,6 @@ "lib/turndown-plugin-gfm.js", "file-helpers.js", "lib/ezycopy.js", - "lib/platform.js", "content-script.js", ]; diff --git a/lib/ezycopy.js b/lib/ezycopy.js index 465ae73..892c0c0 100644 --- a/lib/ezycopy.js +++ b/lib/ezycopy.js @@ -55,14 +55,40 @@ function stripImages(markdown) { } /** - * Count images in an HTML string + * Analyze images in an HTML string - returns both count and image data + * Single-pass extraction to avoid redundant HTML parsing * @param {string} html - HTML content to parse - * @returns {number} Number of img elements found + * @returns {{count: number, images: Array<{src: string, alt: string}>}} Image analysis result */ -function countImagesInHtml(html) { +function analyzeImagesInHtml(html) { const parser = new DOMParser(); const doc = parser.parseFromString(html, 'text/html'); - return doc.querySelectorAll('img').length; + const imgElements = doc.querySelectorAll('img'); + + const images = []; + const seenUrls = new Set(); + + imgElements.forEach((img, index) => { + const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'); + if (!src || src.startsWith('data:') || seenUrls.has(src)) return; + + seenUrls.add(src); + + // Resolve relative URLs to absolute + let absoluteSrc = src; + try { + absoluteSrc = new URL(src, location.href).href; + } catch (e) { + // Keep original if URL parsing fails + } + + images.push({ + src: absoluteSrc, + alt: img.alt || `image-${index}` + }); + }); + + return { count: imgElements.length, images }; } /** @@ -239,13 +265,15 @@ function extractContent(options = {}) { if (selectiveCopy) { const selectionHtml = getSelectionHtml(); if (selectionHtml) { + const { images } = analyzeImagesInHtml(selectionHtml); return { title: document.title, body: turndown.turndown(selectionHtml), sourceUrl: location.href, byline: null, isSelection: true, - html: selectionHtml + html: selectionHtml, + images }; } } @@ -257,13 +285,16 @@ function extractContent(options = {}) { if (!article) { // Fallback: convert body directly if Readability can't parse + const bodyHtml = document.body.outerHTML; + const { images } = analyzeImagesInHtml(bodyHtml); return { title: document.title, body: turndown.turndown(document.body), sourceUrl: location.href, byline: null, isSelection: false, - html: document.body.outerHTML + html: bodyHtml, + images }; } @@ -275,7 +306,8 @@ function extractContent(options = {}) { sourceUrl: location.href, byline: article.byline, isSelection: false, - html: article.content + html: article.content, + images: [] // Skip image analysis when images are disabled }; } @@ -285,68 +317,59 @@ function extractContent(options = {}) { // If no CMS container found, use Readability output if (!container) { + const { images } = analyzeImagesInHtml(article.content); return { title: article.title, body: turndown.turndown(article.content), sourceUrl: location.href, byline: article.byline, isSelection: false, - html: article.content + html: article.content, + images }; } // Check image loss in selected container const containerImageCount = container.querySelectorAll('img').length; - const readabilityImageCount = countImagesInHtml(article.content); + const readabilityAnalysis = analyzeImagesInHtml(article.content); const imageLossRatio = containerImageCount > 0 - ? (containerImageCount - readabilityImageCount) / containerImageCount + ? (containerImageCount - readabilityAnalysis.count) / containerImageCount : 0; // If >50% images lost, use cleaned container to preserve images if (imageLossRatio > 0.5) { const cleanedHtml = cleanContainerHtml(container); + const { images } = analyzeImagesInHtml(cleanedHtml); return { title: article.title, body: turndown.turndown(cleanedHtml), sourceUrl: location.href, byline: article.byline, isSelection: false, - html: cleanedHtml + html: cleanedHtml, + images }; } - // Default: use Readability output + // Default: use Readability output (reuse already-analyzed images) return { title: article.title, body: turndown.turndown(article.content), sourceUrl: location.href, byline: article.byline, isSelection: false, - html: article.content + html: article.content, + images: readabilityAnalysis.images }; } /** - * Generate a safe filename from page title - * @param {string} title - Page or article title - * @returns {string} Safe filename with .md extension - */ -function generateFilename(title) { - const safeTitle = title - .substring(0, 50) - .replace(/[^a-zA-Z0-9]/g, '-') - .replace(/-+/g, '-') - .replace(/^-|-$/g, ''); - const timestamp = new Date().toISOString().slice(0, 10); - return `${safeTitle}-${timestamp}.md`; -} - -/** - * Generate subfolder name for images + * Generate a safe base name from page title (without extension) + * Used for both markdown filenames and image subfolders * @param {string} title - Page or article title - * @returns {string} Safe subfolder name + * @returns {string} Safe base name in format "title-YYYY-MM-DD" */ -function generateSubfolder(title) { +function generateBaseName(title) { const safeTitle = title .substring(0, 50) .replace(/[^a-zA-Z0-9]/g, '-') @@ -356,48 +379,4 @@ function generateSubfolder(title) { return `${safeTitle}-${timestamp}`; } -/** - * Extract all images from the article content - * Uses DOMParser for safe HTML parsing (no script execution) - * @param {string} articleHtml - HTML content from Readability - * @returns {Array<{src: string, alt: string}>} Array of image objects - */ -function extractImagesFromHtml(articleHtml) { - const images = []; - const seenUrls = new Set(); - - // DOMParser is safe - it doesn't execute scripts or load external resources - const parser = new DOMParser(); - const doc = parser.parseFromString(articleHtml, 'text/html'); - - const imgElements = doc.querySelectorAll('img'); - - imgElements.forEach((img, index) => { - const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-lazy-src'); - - if (!src) return; - - // Skip data URIs (already embedded) - if (src.startsWith('data:')) return; - - // Skip duplicates - if (seenUrls.has(src)) return; - seenUrls.add(src); - - // Resolve relative URLs to absolute - let absoluteSrc = src; - try { - absoluteSrc = new URL(src, location.href).href; - } catch (e) { - // Keep original if URL parsing fails - } - - images.push({ - src: absoluteSrc, - alt: img.alt || `image-${index}` - }); - }); - - return images; -} diff --git a/lib/platform.js b/lib/platform.js deleted file mode 100644 index 1b5a0ed..0000000 --- a/lib/platform.js +++ /dev/null @@ -1,36 +0,0 @@ -(function () { - // Prevent redeclaration when scripts are injected multiple times - if (window.EzyCopyPlatform) return; - - // Expect shared helpers to be loaded first - if (!window.EzyCopyFiles) { - console.error('EzyCopyFiles not loaded; platform helpers unavailable'); - return; - } - - const { - EZYCOPY_FOLDER, - IMAGES_SUBFOLDER, - getBasePath: getEzyCopyBasePath, - getImagesPath, - sanitizeImageFilename, - generatePageSubfolder, - rewriteImagePaths, - } = window.EzyCopyFiles; - - // Expose API once to avoid polluting global scope with redeclarable consts - window.EzyCopyPlatform = { - EZYCOPY_FOLDER, - IMAGES_SUBFOLDER, - getEzyCopyBasePath, - getImagesPath, - sanitizeImageFilename, - generatePageSubfolder, - rewriteImagePaths, - }; - - // Backwards compatibility for existing global calls - window.rewriteImagePaths = rewriteImagePaths; - window.generatePageSubfolder = generatePageSubfolder; - window.sanitizeImageFilename = sanitizeImageFilename; -})(); diff --git a/popup.js b/popup.js index 9e424cb..52bdedf 100644 --- a/popup.js +++ b/popup.js @@ -49,7 +49,8 @@ document.addEventListener("DOMContentLoaded", async function () { currentSettings.copyToClipboard = e.target.checked; currentSettings = enforceAtLeastOneActive(currentSettings, 'copyToClipboard'); await saveSettings(currentSettings); - // Sync UI if enforcement changed the other toggle + // Sync both toggles in case enforcement changed either + copyToClipboardToggle.checked = currentSettings.copyToClipboard; downloadMarkdownToggle.checked = currentSettings.downloadMarkdown; updateDownloadImagesVisibility(currentSettings.downloadMarkdown, currentSettings.includeImages); }); @@ -60,8 +61,9 @@ document.addEventListener("DOMContentLoaded", async function () { currentSettings.downloadMarkdown = e.target.checked; currentSettings = enforceAtLeastOneActive(currentSettings, 'downloadMarkdown'); await saveSettings(currentSettings); - // Sync UI if enforcement changed the other toggle + // Sync both toggles in case enforcement changed either copyToClipboardToggle.checked = currentSettings.copyToClipboard; + downloadMarkdownToggle.checked = currentSettings.downloadMarkdown; updateDownloadImagesVisibility(currentSettings.downloadMarkdown, currentSettings.includeImages); }); From 2df13b5d83002c2930032a0e848dda37166b416c Mon Sep 17 00:00:00 2001 From: Samarth Gupta Date: Sat, 29 Nov 2025 20:54:02 +0530 Subject: [PATCH 3/3] Update lib/ezycopy.js Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- lib/ezycopy.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/ezycopy.js b/lib/ezycopy.js index 892c0c0..9cdadc4 100644 --- a/lib/ezycopy.js +++ b/lib/ezycopy.js @@ -88,7 +88,7 @@ function analyzeImagesInHtml(html) { }); }); - return { count: imgElements.length, images }; + return { count: images.length, images }; } /**