6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,9 @@
+## 1.5.3 (2026-1-7)
+
+- fix: enhance snapshot handling and file deletion logic
+- fix: improve browser management in the crawler
+- fix: add ignoreRobots param to skip robots.txt detection
+
 ## 1.5.2 (2026-1-5)

 - chore: update deps
2 changes: 2 additions & 0 deletions blocklets/snap-kit/api/src/routes/index.ts
@@ -19,6 +19,7 @@ const crawlSchema = Joi.object({
   cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })),
   localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })),
   sync: Joi.boolean().default(false),
+  ignoreRobots: Joi.boolean().default(true),
 });
 router.post('/crawl', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => {
   const params = await crawlSchema.validateAsync(req.body);
@@ -98,6 +99,7 @@ const snapSchema = Joi.object({
   cookies: Joi.array().items(Joi.object({ name: Joi.string().required(), value: Joi.string().required() })),
   localStorage: Joi.array().items(Joi.object({ key: Joi.string().required(), value: Joi.string().required() })),
   sync: Joi.boolean().default(false),
+  ignoreRobots: Joi.boolean().default(true),
 });
 router.post('/snap', session({ accessKey: true }), auth({ methods: ['accessKey'] }), async (req, res) => {
   const params = await snapSchema.validateAsync(req.body);
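Both routes now accept an ignoreRobots flag that defaults to true, so robots.txt is skipped unless a caller opts back in. A minimal sketch of opting back in, assuming the routes are mounted under /api and the access key travels as a bearer token (the host, key, target URL, and header scheme are placeholders; only the route, auth method, and ignoreRobots parameter come from the diff above):

// Hypothetical caller of the /crawl endpoint with the new flag.
// How the access key is actually transmitted depends on the blocklet SDK;
// the Authorization header below is a placeholder.
const res = await fetch('https://snap-kit.example.com/api/crawl', {
  method: 'POST',
  headers: {
    'Content-Type': 'application/json',
    Authorization: 'Bearer <access-key>',
  },
  body: JSON.stringify({
    url: 'https://example.com',
    sync: true,
    ignoreRobots: false, // override the default (true) to respect robots.txt
  }),
});
console.log(await res.json());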
2 changes: 1 addition & 1 deletion blocklets/snap-kit/blocklet.yml
@@ -16,7 +16,7 @@ repository:
   type: git
   url: git+https://github.com/blocklet/create-blocklet.git
 specVersion: 1.2.8
-version: 1.5.2
+version: 1.5.3
 logo: logo.png
 files:
   - dist
4 changes: 2 additions & 2 deletions package.json
@@ -1,7 +1,7 @@
 {
   "name": "crawler",
   "private": true,
-  "version": "1.5.2",
+  "version": "1.5.3",
   "scripts": {
     "dev": "pnpm run --filter @arcblock/crawler dev & pnpm run --filter @arcblock/crawler-middleware dev & pnpm run --filter @blocklet/snap-kit dev",
     "build:packages": "pnpm -r build",
@@ -61,4 +61,4 @@
   "simple-git-hooks": {
     "pre-commit": "npx lint-staged"
   }
-}
+}
2 changes: 1 addition & 1 deletion packages/crawler/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@arcblock/crawler",
-  "version": "1.5.2",
+  "version": "1.5.3",
   "main": "lib/cjs/index.js",
   "module": "lib/esm/index.js",
   "types": "lib/cjs/index.d.ts",
31 changes: 28 additions & 3 deletions packages/crawler/src/crawler.ts
@@ -8,7 +8,7 @@ import path from 'path';

 import { config, logger } from './config';
 import { jobDurationSeconds, jobTotalLatencySeconds, jobsEnqueuedTotal, jobsTotal } from './metrics';
-import { initPage } from './puppeteer';
+import { closeBrowser, initPage, isBrowserConnectionError } from './puppeteer';
 import { createCarbonImage } from './services/carbon';
 import { convertJobToSnapshot, deleteSnapshots, formatSnapshot } from './services/snapshot';
 import { Job, JobState, Snapshot, SnapshotModel, sequelize } from './store';
@@ -45,6 +45,7 @@ export function createCrawlQueue(queue: string, handler?: PageHandler) {
     options: {
       concurrency: config.concurrency,
       enableScheduledJob: true,
+      maxRetries: 3,
     },
     onJob: async (job: JobState) => {
       const startTime = Date.now();
@@ -260,6 +261,21 @@ export const getPageContent = async (
   let screenshot: Uint8Array | null = null;
   const meta: { title?: string; description?: string } = {};

+  const closePageSafely = async () => {
+    try {
+      await page.close();
+    } catch (error) {
+      if (isBrowserConnectionError(error)) {
+        try {
+          await closeBrowser({ trimCache: false });
+        } catch (closeError) {
+          logger.warn('Failed to close browser after page close error', { error: closeError });
+        }
+      }
+      logger.warn('Failed to close page:', { error });
+    }
+  };
+
   try {
     const response = await page.goto(url, { timeout });

@@ -359,11 +375,20 @@ export const getPageContent = async (
       logger.error('Failed to get html:', err);
       throw err;
     }
+
+    await closePageSafely();
   } catch (error) {
+    if (isBrowserConnectionError(error)) {
+      try {
+        await closeBrowser({ trimCache: false });
+      } catch (closeError) {
+        logger.warn('Failed to close browser after page error', { error: closeError });
+      }
+    } else {
+      await closePageSafely();
+    }
     logger.error('Failed to get page content:', error);
     throw error;
-  } finally {
-    await page.close();
   }

   return {
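The new flow replaces the unconditional finally { await page.close(); } with closePageSafely, and on a browser-level connection failure tears down the whole browser via closeBrowser({ trimCache: false }) rather than leaving a dead handle behind. The isBrowserConnectionError helper lives in ./puppeteer and is not shown in this diff; a minimal sketch of what such a predicate might look like, assuming it classifies by error message (the pattern list is an assumption, not the package's actual implementation):

// Hypothetical sketch of the imported helper; the real implementation in
// './puppeteer' is not part of this diff. The assumption here is that
// Puppeteer surfaces disconnections as plain Errors with recognizable messages.
export function isBrowserConnectionError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  const patterns = [
    'Protocol error',            // CDP call made after the session died
    'Target closed',             // page/browser target went away mid-operation
    'browser has disconnected',  // navigation aborted by a dropped connection
    'Connection closed',
  ];
  return patterns.some((p) => error.message.includes(p));
}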
2 changes: 2 additions & 0 deletions packages/crawler/src/metrics.ts
@@ -54,6 +54,8 @@ export async function collectMetrics() {
   try {
     // Collect queue sizes
     const jobStats = await Job.stats();
+    // Reset first to clear queues that no longer have jobs
+    queueSize.reset();
     jobStats.queues.forEach((q) => {
       queueSize.set({ queue: q.queue }, q.count);
     });
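queueSize is a labeled gauge, so a queue that drains and drops out of Job.stats() would otherwise keep exporting its last recorded value forever. A standalone sketch of the reset-then-repopulate pattern with prom-client (the metric name, help text, and queue labels are illustrative, not the project's real definitions):

import { Gauge, register } from 'prom-client';

// Illustrative gauge; the project's actual queueSize definition is not in the diff.
const queueSize = new Gauge({
  name: 'queue_size',
  help: 'Number of pending jobs per queue',
  labelNames: ['queue'],
});

queueSize.set({ queue: 'crawl' }, 5);
queueSize.set({ queue: 'snap' }, 2);

// Without reset(), the 'snap' series would keep reporting 2 after its
// queue disappears from the stats; reset() drops all label combinations.
queueSize.reset();
queueSize.set({ queue: 'crawl' }, 3);

console.log(await register.metrics()); // only the 'crawl' series remains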