From e86c55e8ad8f7246b13ee9d19ff857d054ac54b2 Mon Sep 17 00:00:00 2001 From: zhiyuan Date: Wed, 7 Jan 2026 17:11:12 +0800 Subject: [PATCH 1/3] fix: improved browser management in the crawler --- blocklets/snap-kit/api/src/routes/index.ts | 2 + packages/crawler/src/crawler.ts | 31 ++- packages/crawler/src/metrics.ts | 2 + packages/crawler/src/puppeteer.ts | 247 ++++++++++----------- 4 files changed, 148 insertions(+), 134 deletions(-) diff --git a/blocklets/snap-kit/api/src/routes/index.ts b/blocklets/snap-kit/api/src/routes/index.ts index 481798c..9b3f8ea 100644 --- a/blocklets/snap-kit/api/src/routes/index.ts +++ b/blocklets/snap-kit/api/src/routes/index.ts @@ -19,6 +19,7 @@ const crawlSchema = Joi.object({ cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })), localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })), sync: Joi.boolean().default(false), + ignoreRobots: Joi.boolean().default(true), }); router.post('/crawl', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => { const params = await crawlSchema.validateAsync(req.body); @@ -98,6 +99,7 @@ const snapSchema = Joi.object({ cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })), localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })), sync: Joi.boolean().default(false), + ignoreRobots: Joi.boolean().default(true), }); router.post('/snap', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => { const params = await snapSchema.validateAsync(req.body); diff --git a/packages/crawler/src/crawler.ts b/packages/crawler/src/crawler.ts index 893d3f8..90230ba 100644 --- a/packages/crawler/src/crawler.ts +++ b/packages/crawler/src/crawler.ts @@ -8,7 +8,7 @@ import path from 'path'; import { config, logger } from './config'; import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics'; -import { initPage } from './puppeteer'; +import { closeBrowser, initPage, isBrowserConnectionError } from './puppeteer'; import { createCarbonImage } from './services/carbon'; import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot'; import { Job, JobState, Snapshot, SnapshotModel, sequelize } from './store'; @@ -45,6 +45,7 @@ export function createCrawlQueue(queue: string, handler?: PageHandler) { options: { concurrency: config.concurrency, enableScheduledJob: true, + maxRetries: 3, }, onJob: async (job: JobState) => { const startTime = Date.now(); @@ -260,6 +261,21 @@ export const getPageContent = async ( let screenshot: Uint8Array | null = null; const meta: { title?: string; description?: string } = {}; + const closePageSafely = async () => { + try { + await page.close(); + } catch (error) { + if (isBrowserConnectionError(error)) { + try { + await closeBrowser({ trimCache: false }); + } catch (closeError) { + logger.warn('Failed to close browser after page close error', { error: closeError }); + } + } + logger.warn('Failed to close page:', { error }); + } + }; + try { const response = await page.goto(url, { timeout }); @@ -359,11 +375,20 @@ export const getPageContent = async ( logger.error('Failed to get html:', err); throw err; } + + await closePageSafely(); } catch (error) { + if (isBrowserConnectionError(error)) { + try { + await closeBrowser({ trimCache: false }); + } catch (closeError) { + logger.warn('Failed 
to close browser after page error', { error: closeError });
+      }
+    } else {
+      await closePageSafely();
+    }
     logger.error('Failed to get page content:', error);
     throw error;
-  } finally {
-    await page.close();
   }
 
   return {
diff --git a/packages/crawler/src/metrics.ts b/packages/crawler/src/metrics.ts
index 8b6f52a..8c86b8b 100644
--- a/packages/crawler/src/metrics.ts
+++ b/packages/crawler/src/metrics.ts
@@ -54,6 +54,8 @@ export async function collectMetrics() {
   try {
     // Collect queue sizes
     const jobStats = await Job.stats();
+    // Reset first to clear queues that no longer have jobs
+    queueSize.reset();
     jobStats.queues.forEach((q) => {
       queueSize.set({ queue: q.queue }, q.count);
     });
diff --git a/packages/crawler/src/puppeteer.ts b/packages/crawler/src/puppeteer.ts
index 867215a..31e9622 100644
--- a/packages/crawler/src/puppeteer.ts
+++ b/packages/crawler/src/puppeteer.ts
@@ -1,24 +1,21 @@
-import puppeteer, { Browser, Page, ResourceType } from '@blocklet/puppeteer';
+import puppeteer, { Browser, ResourceType } from '@blocklet/puppeteer';
 import fs from 'fs-extra';
 import path from 'path';
-import { clearInterval, setInterval } from 'timers';
 
 import { config, logger } from './config';
-import { Job } from './store';
 import { CRAWLER_FLAG, sleep } from './utils';
 
-const BrowserStatus = {
-  None: 'None',
-  Launching: 'Launching',
-  Ready: 'Ready',
-};
-let browserStatus = BrowserStatus.None;
-
-/** Chromium WebSocket endpoint that allows puppeteer browser instance to connect to the browser */
-let browserEndpoint = '';
-
 let browser: Browser | null;
-let browserActivatedTimer: NodeJS.Timeout | null;
+let browserInitInFlight: Promise<Browser> | null;
+let closingBrowser: Promise<void> | null;
+
+const BROWSER_CONNECTION_ERROR_PATTERNS = [
+  /protocol error/i,
+  /target closed/i,
+  /browser disconnected/i,
+  /session closed/i,
+  /target crashed/i,
+];
 
 export { puppeteer };
 
@@ -72,7 +69,7 @@ export async function ensureBrowser() {
 
   // try to launch browser
   if (config.isProd) {
-    const browser = await launchBrowser();
+    const browser = await getBrowser();
     if (!browser) {
       throw new Error('Failed to launch browser');
     }
@@ -82,35 +79,7 @@
   logger.info('Puppeteer is ready');
 }
 
-export async function connectBrowser() {
-  if (!browserEndpoint) {
-    return null;
-  }
-
-  // retry if browser is launching
-  if (browserStatus === BrowserStatus.Launching) {
-    await sleep(Math.floor(Math.random() * 1000));
-    return connectBrowser();
-  }
-
-  try {
-    browser = await puppeteer.connect({
-      browserWSEndpoint: browserEndpoint,
-    });
-    logger.info('Connect browser success');
-  } catch (err) {
-    logger.warn('Connect browser failed, clear endpoint', err);
-    browserEndpoint = '';
-    return null;
-  }
-
-  return browser;
-}
-
 export async function launchBrowser() {
-  browserEndpoint = '';
-  browserStatus = BrowserStatus.Launching;
-
   try {
     browser = await puppeteer.launch({
       headless: true,
@@ -142,137 +111,153 @@
         '--disable-gpu-sandbox',
       ],
     });
+    attachBrowserListeners(browser);
     logger.info('Launch browser');
   } catch (error) {
     logger.error('launch browser failed: ', error);
-    browserStatus = BrowserStatus.None;
-    browserEndpoint = '';
     throw error;
   }
 
-  // save browserWSEndpoint to cache
-  browserEndpoint = await browser!.wsEndpoint();
-  browserStatus = BrowserStatus.Ready;
-
   return browser;
 }
 
-function checkBrowserActivated() {
-  clearBrowserActivatedTimer();
-
-  let count = 0;
-
-  browserActivatedTimer = setInterval(async () => {
-    if (browser) {
-      const pages = await browser.pages().catch(() => [] as Page[]);
-      const jobCount = await Job.count().catch(() => 0);
-
-      // Check if browser is inactive: only blank page AND no pending jobs
-      const isInactive = pages.length === 1 && pages[0]?.url() === 'about:blank' && jobCount === 0;
+function resetBrowserState(reason?: string) {
+  if (reason) {
+    logger.warn('Reset browser state', { reason });
+  }
+  browser = null;
+  browserInitInFlight = null;
+}
 
-      if (isInactive) {
-        count++;
-        logger.debug(`Browser inactive count: ${count}/3`);
-      } else {
-        count = 0;
-        if (jobCount > 0) {
-          logger.debug(`Browser has ${jobCount} pending jobs, keeping active`);
-        }
-      }
+export function isBrowserConnectionError(error: unknown) {
+  const message = error instanceof Error ? error.message : String(error || '');
+  return BROWSER_CONNECTION_ERROR_PATTERNS.some((pattern) => pattern.test(message));
+}
 
-      if (count >= 3) {
-        logger.info('Browser inactive for 3 minutes, closing...');
-        await closeBrowser({
-          trimCache: true,
-        });
-      }
+function attachBrowserListeners(target: Browser) {
+  target.on('disconnected', () => {
+    if (browser !== target) {
+      return;
     }
-  }, 1000 * 60);
-}
+
+    logger.warn('Browser disconnected');
+    resetBrowserState('disconnected');
+  });
+}
 
-function clearBrowserActivatedTimer() {
-  if (browserActivatedTimer) {
-    clearInterval(browserActivatedTimer);
-    browserActivatedTimer = null;
-  }
-}
-
-export const getBrowser = async () => {
-  if (browser) return browser;
-
+async function initBrowser() {
   // sleep random time (0 ~ 5s) to avoid concurrent blocklet launches
   await sleep(Math.floor(Math.random() * 1000 * 5));
 
-  // try to connect browser
-  const connectedBrowser = await connectBrowser();
-  if (connectedBrowser) {
-    logger.debug('getBrowser.connectedBrowser');
-    browser = connectedBrowser;
-    checkBrowserActivated();
-    return browser;
-  }
-
-  // try to launch browser
   const launchedBrowser = await launchBrowser();
   if (launchedBrowser) {
     logger.debug('getBrowser.launchedBrowser');
     browser = launchedBrowser;
-    checkBrowserActivated();
     return browser;
   }
 
   throw new Error('No browser to use, should install redis or browser');
-};
+}
 
-export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean } = {}) => {
-  if (!browser) return;
+export const getBrowser = async () => {
+  // Wait for any ongoing browser close operation to complete
+  if (closingBrowser) {
+    await closingBrowser;
+  }
 
-  // close all pages
-  try {
-    const pages = await browser.pages();
-    await Promise.all(pages.map((page) => page.close()));
-  } catch (err) {
-    logger.warn('Failed to close all pages:', err);
+  if (browser) {
+    if (browser.isConnected()) {
+      return browser;
+    }
+    logger.warn('Browser instance is disconnected, resetting');
+    resetBrowserState('disconnected');
   }
 
-  // close browser
-  try {
-    await browser.close();
-  } catch (err) {
-    logger.warn('Failed to close browser:', err);
+  if (browserInitInFlight) {
+    return browserInitInFlight;
   }
 
-  // clear cache
-  try {
-    if (trimCache) {
-      await puppeteer.trimCache();
-      logger.debug('Trim cache success');
-    }
+  const initPromise = initBrowser();
 
-    // try to clear temporary directory
-    // if (puppeteerConfig) {
-    //   await fs.emptyDir(puppeteerConfig.temporaryDirectory);
-    // }
+  browserInitInFlight = initPromise;
 
-    if (global.gc) {
-      global.gc();
+  return initPromise.finally(() => {
+    if (browserInitInFlight === initPromise) {
+      browserInitInFlight = null;
     }
-  } catch (err) {
-    logger.warn('Failed to clear browser cache:', err);
+  });
+};
+
+export const closeBrowser = ({ trimCache = true }: { trimCache?: boolean } = {}) => {
+  // Return existing close operation if already in progress
+  if (closingBrowser) {
+    return closingBrowser;
   }
 
+  if (!browser) return;
+
+  const target = browser;
   browser = null;
+  browserInitInFlight = null;
+
+  const doClose = async () => {
+    // close all pages
+    try {
+      const pages = await target.pages();
+      await Promise.all(pages.map((page) => page.close().catch(() => {})));
+    } catch (err) {
+      logger.warn('Failed to close all pages:', err);
+    }
 
-  clearBrowserActivatedTimer();
-  browserEndpoint = '';
-  browserStatus = BrowserStatus.None;
+    // close browser
+    try {
+      await target.close();
+    } catch (err) {
+      logger.warn('Failed to close browser:', err);
+    }
+
+    // clear cache
+    try {
+      if (trimCache) {
+        await puppeteer.trimCache();
+        logger.debug('Trim cache success');
+      }
+
+      if (global.gc) {
+        global.gc();
+      }
+    } catch (err) {
+      logger.warn('Failed to clear browser cache:', err);
+    }
+
+    logger.info('Close browser success');
+  };
 
-  logger.info('Close browser success');
+  closingBrowser = doClose().finally(() => {
+    closingBrowser = null;
+  });
+
+  return closingBrowser;
 };
 
 export async function initPage({ abortResourceTypes = [] }: { abortResourceTypes?: ResourceType[] } = {}) {
-  const browser = await getBrowser();
-  const page = await browser.newPage();
+  const currentBrowser = await getBrowser();
+
+  let page;
+  try {
+    page = await currentBrowser.newPage();
+  } catch (error) {
+    // If newPage fails due to connection error, close browser and retry once
+    if (isBrowserConnectionError(error)) {
+      logger.warn('Failed to create new page due to connection error, restarting browser');
+      await closeBrowser({ trimCache: false });
+      const newBrowser = await getBrowser();
+      page = await newBrowser.newPage();
+    } else {
+      throw error;
+    }
+  }
+
   await page.setViewport({ width: 1440, height: 900 });
 
   // page setting
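
[Note: the sketch below is editorial and not part of the patch series.]
The core of PATCH 1/3 is the single-flight acquisition in getBrowser(): concurrent callers share one in-flight launch instead of racing to start several Chromium processes, and a failed launch clears the slot so the next caller can retry. A minimal standalone sketch of the pattern, using plain puppeteer and illustrative names (acquireBrowser, instance, initInFlight are not from the patch):

import puppeteer, { Browser } from 'puppeteer';

let instance: Browser | null = null;
let initInFlight: Promise<Browser> | null = null;

async function acquireBrowser(): Promise<Browser> {
  // Reuse a live browser; a disconnected one must be replaced.
  if (instance && instance.isConnected()) return instance;

  // Join a launch that is already in progress instead of starting another.
  if (initInFlight) return initInFlight;

  const init = (async () => {
    instance = await puppeteer.launch({ headless: true });
    return instance;
  })();

  initInFlight = init;
  return init.finally(() => {
    // Clear the slot only if it still belongs to this launch attempt.
    if (initInFlight === init) initInFlight = null;
  });
}

The patch applies the same guard on teardown: closeBrowser() parks the close operation in closingBrowser, so getBrowser() can await it before relaunching.
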
From ea0e8bad64ee00493da1965a97e152c9de7e54eb Mon Sep 17 00:00:00 2001
From: zhiyuan
Date: Wed, 7 Jan 2026 17:31:55 +0800
Subject: [PATCH 2/3] fix: enhance snapshot handling and file deletion logic

---
 packages/crawler/src/services/snapshot.ts | 35 +++++++++++++++++++----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/packages/crawler/src/services/snapshot.ts b/packages/crawler/src/services/snapshot.ts
index 94f41a9..b4be72c 100644
--- a/packages/crawler/src/services/snapshot.ts
+++ b/packages/crawler/src/services/snapshot.ts
@@ -39,12 +39,20 @@ export async function formatSnapshot(snapshot: SnapshotModel, columns?: Array, { txn
   const jobIds = await Promise.all(
     snapshots.map(async (snapshot) => {
       try {
+        // Check reference count before deleting files
+        // Only delete file if no other snapshots reference it
+        const deleteFilePromises: Promise<void>[] = [];
+
+        if (snapshot.html) {
+          const htmlRefCount = await Snapshot.count({ where: { html: snapshot.html } });
+          if (htmlRefCount <= 1) {
+            deleteFilePromises.push(fs.unlink(path.join(config.dataDir, snapshot.html)).catch(() => {}));
+          }
+        }
+
+        if (snapshot.screenshot) {
+          const screenshotRefCount = await Snapshot.count({ where: { screenshot: snapshot.screenshot } });
+          if (screenshotRefCount <= 1) {
+            deleteFilePromises.push(fs.unlink(path.join(config.dataDir, snapshot.screenshot)).catch(() => {}));
+          }
+        }
+
         try {
-          await Promise.all([
-            snapshot.html && fs.unlink(path.join(config.dataDir, snapshot.html)),
-            snapshot.screenshot && fs.unlink(path.join(config.dataDir, snapshot.screenshot)),
-          ]);
+          await Promise.all(deleteFilePromises);
         } catch (err) {
-          logger.error('Failed to delete snapshot', { err, snapshot, dataDir: config.dataDir });
+          logger.error('Failed to delete snapshot files', { err, snapshot, dataDir: config.dataDir });
         }
 
         await snapshot.destroy({ transaction: txn });
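
[Note: the sketch below is editorial and not part of the patch series.]
PATCH 2/3 guards file deletion with a reference count: Snapshot.count() still includes the row that is about to be destroyed, so a count of 1 means no other snapshot shares the file, hence the "<= 1" comparison rather than "=== 0". The rule, condensed into a hypothetical helper (canDeleteFile is an illustrative name and the import path is assumed):

import { Snapshot } from '../store';

// A stored file may be unlinked only when no OTHER snapshot row references it.
// The count includes the snapshot being deleted, so "<= 1" is the safe bound.
async function canDeleteFile(column: 'html' | 'screenshot', value: string): Promise<boolean> {
  const refCount = await Snapshot.count({ where: { [column]: value } });
  return refCount <= 1;
}
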
From 2b2ac21e07206d5a224149e28e4fb7f1bb308772 Mon Sep 17 00:00:00 2001
From: zhiyuan
Date: Wed, 7 Jan 2026 17:39:43 +0800
Subject: [PATCH 3/3] chore: 1.5.3

---
 CHANGELOG.md                     | 6 ++++++
 blocklets/snap-kit/blocklet.yml  | 2 +-
 package.json                     | 4 ++--
 packages/crawler/package.json    | 2 +-
 packages/middleware/package.json | 2 +-
 version                          | 2 +-
 6 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4655e9a..6314316 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 1.5.3 (2026-1-7)
+
+- fix: enhance snapshot handling and file deletion logic
+- fix: improved browser management in the crawler
+- fix: add ignoreRobots param to skip robots detection
+
 ## 1.5.2 (2026-1-5)
 
 - chore: update deps
diff --git a/blocklets/snap-kit/blocklet.yml b/blocklets/snap-kit/blocklet.yml
index c593774..7ce74ea 100644
--- a/blocklets/snap-kit/blocklet.yml
+++ b/blocklets/snap-kit/blocklet.yml
@@ -16,7 +16,7 @@ repository:
   type: git
   url: git+https://github.com/blocklet/create-blocklet.git
 specVersion: 1.2.8
-version: 1.5.2
+version: 1.5.3
 logo: logo.png
 files:
   - dist
diff --git a/package.json b/package.json
index 2b4a16b..c884801 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "crawler",
   "private": true,
-  "version": "1.5.2",
+  "version": "1.5.3",
   "scripts": {
     "dev": "pnpm run --filter @arcblock/crawler dev & pnpm run --filter @arcblock/crawler-middleware dev & pnpm run --filter @blocklet/snap-kit dev",
     "build:packages": "pnpm -r build",
@@ -61,4 +61,4 @@
   "simple-git-hooks": {
     "pre-commit": "npx lint-staged"
   }
-}
\ No newline at end of file
+}
diff --git a/packages/crawler/package.json b/packages/crawler/package.json
index 8716622..634f8f4 100644
--- a/packages/crawler/package.json
+++ b/packages/crawler/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.5.2",
+  "version": "1.5.3",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
diff --git a/packages/middleware/package.json b/packages/middleware/package.json
index 31c3de6..ea9032c 100644
--- a/packages/middleware/package.json
+++ b/packages/middleware/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler-middleware",
-  "version": "1.5.2",
+  "version": "1.5.3",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
diff --git a/version b/version
index a73b432..1d5e9e0 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-1.5.2
\ No newline at end of file
+1.5.3
\ No newline at end of file
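
[Note: the sketch below is editorial and not part of the patch series.]
For callers outside getPageContent, the recovery contract established by PATCH 1/3 is: treat a match from isBrowserConnectionError() as "the shared browser is dead", restart it via closeBrowser(), and never trust the old page handle afterwards. A minimal caller-side sketch against the exports of packages/crawler/src/puppeteer.ts (renderPage and the 30s timeout are illustrative, not from the patches):

import { closeBrowser, initPage, isBrowserConnectionError } from './puppeteer';

async function renderPage(url: string): Promise<string> {
  const page = await initPage();
  try {
    await page.goto(url, { timeout: 30000 });
    return await page.content();
  } catch (error) {
    if (isBrowserConnectionError(error)) {
      // The browser process itself is gone; restart it rather than keep a
      // dead handle. trimCache is skipped so recovery stays fast, matching
      // the patch's own hot-path behaviour.
      await closeBrowser({ trimCache: false });
    }
    throw error;
  } finally {
    // The page may already be unusable, so swallow close errors here.
    await page.close().catch(() => {});
  }
}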