-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikiCrawler.py
More file actions
executable file
·107 lines (89 loc) · 4.1 KB
/
wikiCrawler.py
File metadata and controls
executable file
·107 lines (89 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/Library/Frameworks/Python.framework/Versions/3.4/bin/python3.4
from wikiArticleRequester import WikiArticleRequester
from wikiPageParser import WikiPageParser
from crawlerResults import CrawlerResult, CrawlerResults
import sys
class WikiCrawler:
    """Crawl Wikipedia by repeatedly following the first "legitimate" link
    of each article, testing whether every page eventually reaches the
    'Philosophy' article (the "getting to Philosophy" phenomenon).
    """

    # NOTE(review): unused at present — kept for interface compatibility,
    # candidate for removal.
    pathCache = {}
    # Shared (class-level) collaborators: HTTP fetcher and wikitext parser.
    requester = WikiArticleRequester()
    parser = WikiPageParser()

    def inefficientFindXPathsFromRandomArticles(self, numberOfArticles):
        """Crawl `numberOfArticles` random articles one at a time.

        "Inefficient" because it shares no path cache between articles;
        see findPathToPhilosophyFromRandomArticles for the cached variant.
        Returns a CrawlerResults aggregate.
        """
        results = CrawlerResults()
        for _ in range(numberOfArticles):
            # BUG FIX: previously called through the module-level global
            # `crawler` instead of `self`, breaking any other instance.
            results.add(self.findPathToPhilosophyFromRandomArticle())
        return results

    def findPathToPhilosophyFromRandomArticle(self):
        """Follow first-links from one random article until 'philosophy',
        a dead end (no link), or a cycle. Returns a CrawlerResult whose
        path is None on failure.
        """
        randomTitle = self.requester.randomArticleTitle()
        path = [randomTitle]
        # Visited titles, lowercased so cycle detection is case-insensitive
        # (titles elsewhere in this class are compared via .lower()).
        pathSet = {randomTitle.lower()}
        while path[-1].lower() != 'philosophy':
            rawArticle = self.requester.getRawArticle(path[-1])
            firstLink = self.parser.getFirstLegitimateLinkTitleForString(rawArticle)
            # If there is no link on the article, we hit a dead end.
            if firstLink is None:
                print('No First Link')
                print(path)
                return CrawlerResult(randomTitle, None)
            path.append(firstLink)
            # If we see the same article twice, we've hit an infinite loop.
            # BUG FIX: membership check was case-sensitive while the rest of
            # the class compares titles lowercased, so a loop differing only
            # in case would never terminate.
            if firstLink.lower() in pathSet:
                print('We hit a loop in: ')
                print(path)
                return CrawlerResult(randomTitle, None)
            pathSet.add(firstLink.lower())
        print('success!')
        print(path)
        return CrawlerResult(randomTitle, path)

    def findPathToPhilosophyFromRandomArticles(self, numberOfArticles):
        """Crawl `numberOfArticles` random articles, sharing success and
        failure caches across articles so previously-resolved titles are
        answered without further HTTP requests. Returns CrawlerResults.
        """
        randomTitles = self.requester.getRandomArticleTitles(numberOfArticles)
        # lowercased title -> remaining lowercased path to 'philosophy'
        successfulPathsDict = {}
        # lowercased titles known to dead-end or loop
        failedPathSet = set()
        results = CrawlerResults()
        for randomTitle in randomTitles:
            path = [randomTitle]
            # Lowercased visited set for case-insensitive cycle detection.
            pathSet = {randomTitle.lower()}
            while True:
                current = path[-1].lower()
                if current == 'philosophy':
                    print('success for: ' + str(path))
                    results.add(CrawlerResult(randomTitle, path))
                    # Every suffix of a successful path is itself successful;
                    # cache each one keyed by its starting title.
                    for i, title in enumerate(path):
                        successfulPathsDict[title.lower()] = [t.lower() for t in path[i:]]
                    break
                elif current in failedPathSet:
                    print(randomTitle + ' failed based off cache: ' + str(path))
                    results.add(CrawlerResult(randomTitle, None))
                    break
                elif current in successfulPathsDict:
                    # Splice in the cached tail (skip its first element,
                    # which duplicates the current title).
                    path.extend(successfulPathsDict[current][1:])
                    results.add(CrawlerResult(randomTitle, path))
                    print(randomTitle + ' success based off cache: ' + str(path))
                    break
                else:
                    rawArticle = self.requester.getRawArticle(path[-1])
                    nextLink = self.parser.getFirstLegitimateLinkTitleForString(rawArticle)
                    if nextLink is None:
                        print('Dead end for:' + str(path))
                        results.add(CrawlerResult(randomTitle, None))
                        # Every title on a failed path is known to fail.
                        for each in path:
                            failedPathSet.add(each.lower())
                        break
                    # BUG FIX: loop check was case-sensitive (`nextLink in
                    # pathSet`) while caches are lowercased — a cycle whose
                    # titles differed only in case could spin forever.
                    elif nextLink.lower() in pathSet:
                        path.append(nextLink)
                        print('We hit a loop in: ' + str(path))
                        results.add(CrawlerResult(randomTitle, None))
                        for each in path:
                            failedPathSet.add(each.lower())
                        break
                    else:
                        path.append(nextLink)
                        pathSet.add(nextLink.lower())
        return results
if __name__ == '__main__':
    # Guard the crawl so importing this module does not fire HTTP requests
    # or crash on a missing argv; validate the article-count argument.
    if len(sys.argv) < 2 or not sys.argv[1].isdigit():
        sys.exit('usage: wikiCrawler.py <numberOfArticles>')
    crawler = WikiCrawler()
    crawlResults = crawler.findPathToPhilosophyFromRandomArticles(int(sys.argv[1]))
    crawlResults.printResultStats()
    print('Number of HTTP requests: ' + str(crawler.requester.numberOfHttpRequests))