-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebCrawler2.java
More file actions
40 lines (33 loc) · 1.18 KB
/
WebCrawler2.java
File metadata and controls
40 lines (33 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.HashSet;
public class WebCrawler2 {
private static final int MAX_DEPTH = 2;
private HashSet<String> links;
public WebCrawler2() {
links = new HashSet<>();
}
public void getPageLinks(String URL, int depth) {
if ((!links.contains(URL) && (depth < MAX_DEPTH))) {
//System.out.println(">> Depth: " + depth + " [" + URL + "]");
System.out.println(URL);
try {
links.add(URL);
Document document = Jsoup.connect(URL).get();
Elements linksOnPage = document.select("a[href]");
depth++;
for (Element page : linksOnPage) {
getPageLinks(page.attr("abs:href"), depth);
}
} catch (IOException e) {
System.err.println("For '" + URL + "': " + e.getMessage());
}
}
}
public static void main(String[] args) {
new WebCrawler2().getPageLinks("https://en.wikipedia.org/wiki/Kobe_Bryant", 0);
}
}