-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_ultra_scraper.py
More file actions
495 lines (412 loc) · 19.4 KB
/
test_ultra_scraper.py
File metadata and controls
495 lines (412 loc) · 19.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#!/usr/bin/env python3
"""
Comprehensive Test Suite for Ultra-Modern LinkedIn Scrapers
Tests all scraping methods including Scrapy, Playwright, and Selenium
"""
import json
import os
import sys
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple
# Add current directory to path for imports
# (lets the sibling scraper modules be imported when run as a script)
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Load environment variables from a local .env file before any scraper
# module reads them (presumably scraper credentials/proxies — TODO confirm).
from dotenv import load_dotenv
load_dotenv()
class UltraScraperTestSuite:
    """Comprehensive test suite for all scraping methods.

    Every check is recorded through :meth:`log_test` into ``self.results``
    (one dict per check).  :meth:`generate_report` prints a summary grouped
    by category (the part of the test name before " - ") and writes the
    full results to ``ultra_scraper_test_report.json``.
    """

    def __init__(self):
        # One dict per logged check; schema is built in log_test().
        self.results: List[Dict] = []
        # Wall-clock start of the whole run; set by run_all_tests().
        self.start_time: Optional[float] = None

    def log_test(self, test_name: str, success: bool, message: str = "",
                 data: Optional[Dict] = None) -> None:
        """Record one test outcome and echo a PASS/FAIL line to stdout.

        Args:
            test_name: Identifier, conventionally "<Category> - <detail>";
                generate_report() splits on " - " to group by category.
            success: Whether the check passed.
            message: Human-readable detail for the console and report.
            data: Optional structured payload (timings, rates, ...).
        """
        result = {
            "test": test_name,
            "success": success,
            "message": message,
            "timestamp": datetime.now().isoformat(),
            "data": data
        }
        self.results.append(result)
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"{status} {test_name}: {message}")

    def run_all_tests(self) -> None:
        """Run the comprehensive scraper test battery and print a report."""
        print("🚀 Ultra-Modern LinkedIn Scraper Test Suite")
        print("=" * 70)
        self.start_time = time.time()
        # Test URLs with different expected anti-scraping challenges.
        test_urls = [
            {
                "url": "https://www.linkedin.com/in/liveankit",
                "name": "Ankit Kumar Profile",
                "expected_challenges": ["security_verification"]
            },
            {
                "url": "https://in.linkedin.com/in/hiren-danecha-695a51110",
                "name": "Hiren Danecha Profile",
                "expected_challenges": ["country_domain"]
            },
            {
                "url": "https://www.linkedin.com/in/williamhgates/",
                "name": "Bill Gates Profile",
                "expected_challenges": ["high_profile"]
            }
        ]
        # Test 1: Scrapy-based scraper
        self.test_scrapy_scraper(test_urls)
        # Test 2: Modern scraper with ultra methods
        self.test_modern_scraper(test_urls)
        # Test 3: Individual scraping methods
        self.test_individual_methods(test_urls[0]["url"])
        # Test 4: Error handling and edge cases
        self.test_error_handling()
        # Test 5: Performance benchmarks
        self.test_performance(test_urls[0]["url"])
        # Generate comprehensive report
        self.generate_report()

    def test_scrapy_scraper(self, test_urls: List[Dict]) -> None:
        """Exercise the Scrapy-based scraper against each test URL.

        Logs one result per URL plus an overall success-rate entry.
        Missing dependencies are logged as failures, never raised.
        """
        print("\n🕷️ Testing Scrapy-Based Scraper")
        print("-" * 50)
        try:
            # The Scrapy scraper is an optional sibling module; treat an
            # ImportError as a logged failure rather than a crash.
            try:
                from scrapy_linkedin_scraper import scrape_single_linkedin_profile
                scrapy_available = True
            except ImportError as e:
                self.log_test("Scrapy Import", False, f"Import failed: {e}")
                scrapy_available = False
            if scrapy_available:
                self.log_test("Scrapy Import", True, "Successfully imported Scrapy scraper")
                # Test each URL
                successful_scrapes = 0
                for test_case in test_urls:
                    url = test_case["url"]
                    name = test_case["name"]
                    print(f"\n🔍 Testing Scrapy with {name}")
                    try:
                        start_time = time.time()
                        result = scrape_single_linkedin_profile(url)
                        duration = time.time() - start_time
                        if result.get('success'):
                            successful_scrapes += 1
                            self.log_test(
                                f"Scrapy - {name}",
                                True,
                                f"Success in {duration:.2f}s - {result.get('full_name', 'N/A')}",
                                {"duration": duration, "method": result.get('scraping_method')}
                            )
                        else:
                            self.log_test(
                                f"Scrapy - {name}",
                                False,
                                f"Failed: {result.get('error', 'Unknown error')}"
                            )
                    except Exception as e:
                        self.log_test(f"Scrapy - {name}", False, f"Exception: {str(e)}")
                # Overall Scrapy success rate (guard against an empty URL list).
                success_rate = (successful_scrapes / len(test_urls)) * 100 if test_urls else 0.0
                self.log_test(
                    "Scrapy Overall",
                    successful_scrapes > 0,
                    f"Success rate: {success_rate:.1f}% ({successful_scrapes}/{len(test_urls)})",
                    {"success_rate": success_rate}
                )
        except Exception as e:
            self.log_test("Scrapy Test Suite", False, f"Test suite error: {e}")

    def test_modern_scraper(self, test_urls: List[Dict]) -> None:
        """Exercise the modern multi-method scraper against each test URL.

        Success is read from the scraper's ``_scraping_info`` metadata dict.
        """
        print("\n🚀 Testing Modern Scraper with Ultra Methods")
        print("-" * 50)
        try:
            from scraper_modern import scrape_linkedin_profile_modern
            successful_scrapes = 0
            for test_case in test_urls:
                url = test_case["url"]
                name = test_case["name"]
                print(f"\n🔍 Testing Modern Scraper with {name}")
                try:
                    start_time = time.time()
                    result = scrape_linkedin_profile_modern(url)
                    duration = time.time() - start_time
                    scraping_info = result.get('_scraping_info', {})
                    if scraping_info.get('success'):
                        successful_scrapes += 1
                        self.log_test(
                            f"Modern - {name}",
                            True,
                            f"Success in {duration:.2f}s - Method: {scraping_info.get('method')}",
                            {"duration": duration, "method": scraping_info.get('method')}
                        )
                    else:
                        self.log_test(
                            f"Modern - {name}",
                            False,
                            f"Failed with method: {scraping_info.get('method')}"
                        )
                except Exception as e:
                    self.log_test(f"Modern - {name}", False, f"Exception: {str(e)}")
            # Overall modern scraper success rate (guard against empty list).
            success_rate = (successful_scrapes / len(test_urls)) * 100 if test_urls else 0.0
            self.log_test(
                "Modern Scraper Overall",
                successful_scrapes > 0,
                f"Success rate: {success_rate:.1f}% ({successful_scrapes}/{len(test_urls)})",
                {"success_rate": success_rate}
            )
        except ImportError as e:
            self.log_test("Modern Scraper Import", False, f"Import failed: {e}")
        except Exception as e:
            self.log_test("Modern Scraper Test", False, f"Test error: {e}")

    def test_individual_methods(self, test_url: str) -> None:
        """Probe each low-level scraping method against a single URL.

        Each ``_test_*`` helper returns ``(success, message, duration)``;
        exceptions from a helper are caught and logged as failures.
        """
        print("\n🔧 Testing Individual Scraping Methods")
        print("-" * 50)
        methods_to_test = [
            ("Selenium Undetected", self._test_selenium_method),
            ("Playwright Local", self._test_playwright_method),
            ("HTTP Requests", self._test_http_method),
            ("Mobile Simulation", self._test_mobile_method)
        ]
        for method_name, test_func in methods_to_test:
            try:
                print(f"\n🔍 Testing {method_name}")
                success, message, duration = test_func(test_url)
                self.log_test(
                    f"Method - {method_name}",
                    success,
                    f"{message} (Duration: {duration:.2f}s)",
                    {"duration": duration}
                )
            except Exception as e:
                self.log_test(f"Method - {method_name}", False, f"Exception: {str(e)}")

    def _test_selenium_method(self, url: str) -> Tuple[bool, str, float]:
        """Probe the Selenium undetected scraper.

        Returns:
            ``(success, message, duration_seconds)``; duration is 0 when the
            scraper never ran (missing dependency or early error).
        """
        try:
            from scraper_selenium import scrape_linkedin_profile_selenium
            start_time = time.time()
            result = scrape_linkedin_profile_selenium(url)
            duration = time.time() - start_time
            if result and not result.get("not_found"):
                return True, f"Success - {result.get('full_name', 'N/A')}", duration
            else:
                return False, "No valid data returned", duration
        except ImportError:
            return False, "Selenium not available", 0
        except Exception as e:
            return False, f"Error: {str(e)}", 0

    def _test_playwright_method(self, url: str) -> Tuple[bool, str, float]:
        """Probe the Playwright local scraper.

        Returns:
            ``(success, message, duration_seconds)``.
        """
        try:
            from scraper_local import scrape_linkedin_profile_local
            start_time = time.time()
            result = scrape_linkedin_profile_local(url)
            duration = time.time() - start_time
            if result and not result.get("not_logged_in") and not result.get("not_found"):
                return True, f"Success - {result.get('full_name', 'N/A')}", duration
            else:
                return False, "Login required or not found", duration
        except ImportError:
            return False, "Playwright not available", 0
        except Exception as e:
            return False, f"Error: {str(e)}", 0

    def _test_http_method(self, url: str) -> Tuple[bool, str, float]:
        """Probe plain HTTP scraping with a randomized desktop user agent.

        Counts as success only when the response is 200 and the page title
        mentions LinkedIn.  Returns ``(success, message, duration_seconds)``.
        """
        try:
            import requests
            from fake_useragent import UserAgent
            from bs4 import BeautifulSoup
            ua = UserAgent()
            headers = {
                'User-Agent': ua.random,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }
            start_time = time.time()
            response = requests.get(url, headers=headers, timeout=10)
            duration = time.time() - start_time
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                title = soup.find('title')
                if title and 'linkedin' in title.get_text().lower():
                    return True, "HTTP request successful", duration
            return False, f"HTTP {response.status_code}", duration
        except Exception as e:
            return False, f"Error: {str(e)}", 0

    def _test_mobile_method(self, url: str) -> Tuple[bool, str, float]:
        """Probe HTTP scraping with an iPhone user agent.

        Any 200 response counts as success (no content validation).
        Returns ``(success, message, duration_seconds)``.
        """
        try:
            import requests
            mobile_ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15'
            headers = {'User-Agent': mobile_ua}
            start_time = time.time()
            response = requests.get(url, headers=headers, timeout=10)
            duration = time.time() - start_time
            if response.status_code == 200:
                return True, "Mobile simulation successful", duration
            else:
                return False, f"Mobile HTTP {response.status_code}", duration
        except Exception as e:
            return False, f"Error: {str(e)}", 0

    def test_error_handling(self) -> None:
        """Feed invalid inputs to the modern scraper; it must not crash.

        A case counts as handled when the scraper returns a dict instead of
        raising.  The ``None`` case is currently skipped (see comment).
        """
        print("\n🛡️ Testing Error Handling")
        print("-" * 50)
        error_test_cases = [
            ("Invalid URL", "https://invalid-url.com"),
            ("Non-LinkedIn URL", "https://google.com"),
            ("Malformed LinkedIn URL", "https://linkedin.com/invalid"),
            ("Empty String", ""),
            ("None Input", None)
        ]
        try:
            from scraper_modern import scrape_linkedin_profile_modern
            handled_errors = 0
            for test_name, test_input in error_test_cases:
                try:
                    if test_input is None:
                        continue  # Skip None test for now
                    result = scrape_linkedin_profile_modern(test_input)
                    # Should handle gracefully without crashing
                    if isinstance(result, dict):
                        handled_errors += 1
                        self.log_test(f"Error Handling - {test_name}", True, "Handled gracefully")
                    else:
                        self.log_test(f"Error Handling - {test_name}", False, "Invalid response type")
                except Exception as e:
                    self.log_test(f"Error Handling - {test_name}", False, f"Unhandled exception: {str(e)}")
            # Denominator excludes the skipped None case.
            success_rate = (handled_errors / (len(error_test_cases) - 1)) * 100
            self.log_test(
                "Error Handling Overall",
                handled_errors > 0,
                f"Handled {handled_errors}/{len(error_test_cases)-1} error cases ({success_rate:.1f}%)"
            )
        except ImportError as e:
            self.log_test("Error Handling Test", False, f"Import failed: {e}")

    def test_performance(self, test_url: str) -> None:
        """Benchmark the modern scraper over a few iterations.

        Averages duration over the *successful* runs only and maps it to a
        coarse rating (Excellent < 10s < Good < 30s < Acceptable < 60s).
        """
        print("\n⚡ Testing Performance")
        print("-" * 50)
        try:
            from scraper_modern import scrape_linkedin_profile_modern
            # Run multiple iterations for average
            iterations = 3
            total_time = 0
            successful_runs = 0
            for i in range(iterations):
                try:
                    print(f"Performance test {i+1}/{iterations}")
                    start_time = time.time()
                    result = scrape_linkedin_profile_modern(test_url)
                    duration = time.time() - start_time
                    total_time += duration
                    scraping_info = result.get('_scraping_info', {})
                    if scraping_info.get('success'):
                        successful_runs += 1
                except Exception as e:
                    print(f"Performance test {i+1} failed: {e}")
            if successful_runs > 0:
                # NOTE: failed-run durations are included in total_time, so
                # the "average" skews high when some runs fail — intentional?
                avg_time = total_time / successful_runs
                # Performance ratings
                if avg_time < 10:
                    rating = "Excellent"
                elif avg_time < 30:
                    rating = "Good"
                elif avg_time < 60:
                    rating = "Acceptable"
                else:
                    rating = "Needs Improvement"
                self.log_test(
                    "Performance Benchmark",
                    True,
                    f"Average: {avg_time:.2f}s - {rating} ({successful_runs}/{iterations} successful)",
                    {"avg_time": avg_time, "rating": rating, "success_rate": successful_runs/iterations}
                )
            else:
                self.log_test("Performance Benchmark", False, "No successful runs for performance measurement")
        except ImportError as e:
            self.log_test("Performance Test", False, f"Import failed: {e}")

    def generate_report(self) -> None:
        """Print a summary of all logged results and save a JSON report.

        Safe to call even if run_all_tests() never ran (start_time unset).
        Writes ``ultra_scraper_test_report.json`` in the working directory.
        """
        # Guard: start_time is None when the suite was not started normally;
        # the original code crashed with a TypeError here in that case.
        total_time = (time.time() - self.start_time) if self.start_time is not None else 0.0
        print("\n" + "=" * 70)
        print("📊 ULTRA-MODERN SCRAPER TEST REPORT")
        print("=" * 70)
        # Summary statistics
        total_tests = len(self.results)
        passed_tests = sum(1 for r in self.results if r["success"])
        failed_tests = total_tests - passed_tests
        success_rate = (passed_tests / total_tests) * 100 if total_tests > 0 else 0
        print(f"📈 Total Tests: {total_tests}")
        print(f"✅ Passed: {passed_tests}")
        print(f"❌ Failed: {failed_tests}")
        print(f"📊 Success Rate: {success_rate:.1f}%")
        print(f"⏱️ Total Time: {total_time:.2f}s")
        # Category breakdown: category is the prefix before " - ".
        categories = {}
        for result in self.results:
            category = result["test"].split(" - ")[0]
            if category not in categories:
                categories[category] = {"passed": 0, "total": 0}
            categories[category]["total"] += 1
            if result["success"]:
                categories[category]["passed"] += 1
        print("\n📋 Category Breakdown:")
        print("-" * 70)
        for category, stats in categories.items():
            rate = (stats["passed"] / stats["total"]) * 100
            print(f"{category:25} {stats['passed']:2}/{stats['total']:2} ({rate:5.1f}%)")
        # Recommendations
        print("\n💡 Recommendations:")
        print("-" * 70)
        if failed_tests == 0:
            print("🎉 All tests passed! Your scraping system is working perfectly.")
            print("✨ Features working:")
            print("   • Multi-method scraping with fallbacks")
            print("   • Advanced anti-detection techniques")
            print("   • Robust error handling")
            print("   • Performance optimization")
        else:
            print("🔧 Areas for improvement:")
            failed_categories = []
            for result in self.results:
                if not result["success"]:
                    category = result["test"].split(" - ")[0]
                    if category not in failed_categories:
                        failed_categories.append(category)
            for category in failed_categories:
                if "Scrapy" in category:
                    print("   • Install Scrapy dependencies: pip install scrapy scrapy-playwright")
                elif "Modern" in category:
                    print("   • Check modern scraper configuration")
                elif "Method" in category:
                    print("   • Individual methods may need authentication setup")
                elif "Error" in category:
                    print("   • Improve error handling for edge cases")
        # Save detailed report
        report_data = {
            "timestamp": datetime.now().isoformat(),
            "version": "ultra_modern_v1.0",
            "summary": {
                "total_tests": total_tests,
                "passed_tests": passed_tests,
                "failed_tests": failed_tests,
                "success_rate": success_rate,
                "total_time": total_time
            },
            "categories": categories,
            "results": self.results
        }
        # Explicit UTF-8 + ensure_ascii=False: messages contain emoji and
        # would raise UnicodeEncodeError under a cp1252 default encoding.
        with open("ultra_scraper_test_report.json", "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2, ensure_ascii=False)
        print("\n💾 Detailed report saved to: ultra_scraper_test_report.json")
def main():
    """Entry point: announce the suite, then run every scraper test."""
    print("🚀 Ultra-Modern LinkedIn Scraper - Comprehensive Test Suite")
    print("🔧 Testing Scrapy, Playwright, Selenium, and advanced techniques...")
    print()
    # Build the suite and execute the full battery (prints its own report).
    test_suite = UltraScraperTestSuite()
    test_suite.run_all_tests()


if __name__ == "__main__":
    main()