-
Notifications
You must be signed in to change notification settings - Fork 232
Expand file tree
/
Copy pathexample_cache.php
More file actions
71 lines (57 loc) · 2.67 KB
/
example_cache.php
File metadata and controls
71 lines (57 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
<?php

declare(strict_types=1);

/**
 * Example demonstrating the CachedResourceFilter.
 *
 * Shows how to use the cache filter to avoid re-downloading resources
 * that are already cached and fresh. Run the script twice: the first run
 * downloads everything; a second run within the max-age window skips the
 * URIs the cache filter recognises as fresh.
 */
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
use VDB\Spider\Filter\Prefetch\AllowedSchemeFilter;
use VDB\Spider\Filter\Prefetch\CachedResourceFilter;
use VDB\Spider\PersistenceHandler\FileSerializedResourcePersistenceHandler;
use VDB\Spider\Spider;

require_once(__DIR__ . '/example_complex_bootstrap.php');

// The URI we want to start crawling with.
$seed = 'https://www.dmoz-odp.org/Computers/Internet/';

// Use a fixed spider ID so the cache directory is shared across runs
// (the persistence handler keys its storage path on this ID).
$spiderId = 'example-cached-spider';

// Create the spider with the fixed ID and cap downloads so the example
// finishes quickly.
$spider = new Spider($seed, null, null, null, $spiderId);
$spider->getDownloader()->setDownloadLimit(5);

// Only follow links one level deep from the seed.
$spider->getDiscovererSet()->setMaxDepth(1);

// Discover root-relative and absolute links via XPath.
$spider->getDiscovererSet()->addDiscoverer(
    new XPathExpressionDiscoverer("//a[starts-with(@href, '/') or starts-with(@href, 'http')]")
);

// Persist downloaded resources as serialized files under ./cache; the
// cache filter below reads the same directory.
$resultsPath = __DIR__ . '/cache';
$spider->getDownloader()->setPersistenceHandler(
    new FileSerializedResourcePersistenceHandler($resultsPath)
);

// Standard prefetch filters: restrict the scheme and the host.
$spider->getDiscovererSet()->addFilter(new AllowedSchemeFilter(['http', 'https']));
// NOTE(review): second argument presumably toggles subdomain matching — confirm
// against the AllowedHostsFilter constructor before relying on it.
$spider->getDiscovererSet()->addFilter(new AllowedHostsFilter([$seed], true));

// Cache filter: URIs whose cached copy is younger than $maxAgeSeconds are
// skipped. Set maxAge to 0 to always use cache regardless of age.
$maxAgeSeconds = 3600; // 1 hour
$cacheFilter = new CachedResourceFilter($resultsPath, $spiderId, $maxAgeSeconds);
$spider->getDiscovererSet()->addFilter($cacheFilter);

echo "\nStarting crawl with cache enabled (maxAge: {$maxAgeSeconds}s)...\n";
echo "Cache directory: {$resultsPath}/{$spiderId}\n";
echo "On first run, all resources will be downloaded.\n";
echo "On subsequent runs within {$maxAgeSeconds}s, cached resources will be skipped.\n\n";

// Execute the crawl.
$spider->crawl();

echo "\nCrawl complete!\n";
echo "Persisted resources: " . $spider->getDownloader()->getPersistenceHandler()->count() . "\n";

// Show cache statistics: count every file under the spider's cache directory.
$cacheDir = $resultsPath . DIRECTORY_SEPARATOR . $spiderId;
if (is_dir($cacheDir)) {
    $files = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($cacheDir, RecursiveDirectoryIterator::SKIP_DOTS),
        RecursiveIteratorIterator::LEAVES_ONLY
    );
    $fileCount = iterator_count($files);
    echo "Total files in cache: {$fileCount}\n";
    echo "\nRun this example again to see the cache filter in action!\n";
}