Web Crawler, Multithreaded: starting from startUrl, collect every URL reachable via links that shares startUrl's hostname, crawling in parallel.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
* // This is the HtmlParser's API interface.
* // You should not implement it, or speculate about its implementation
* interface HtmlParser {
* public List<String> getUrls(String url) {}
* }
*/

class Solution {
    /**
     * Crawls every URL reachable from {@code startUrl} (following links returned by
     * {@code htmlParser.getUrls}) that has the same hostname as {@code startUrl}.
     *
     * @param startUrl   the URL to start crawling from; always included in the result
     * @param htmlParser provided API that returns the links found on a page
     * @return all distinct same-hostname URLs discovered, including {@code startUrl}
     */
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = getHostname(startUrl);

        // Concurrent set backed by ConcurrentHashMap. Its add() is atomic, so it can be
        // used as a claim-check from multiple parallel-stream worker threads at once.
        Set<String> visited = ConcurrentHashMap.newKeySet();
        visited.add(startUrl);

        return crawl(startUrl, htmlParser, hostname, visited).collect(Collectors.toList());
    }

    /**
     * Returns the scheme + host prefix of {@code url}, e.g. "http://news.yahoo.com".
     *
     * Generalized from a hard-coded offset of 7 (length of "http://"): locating "://"
     * makes this correct for any scheme (https, ftp, ...) while returning identical
     * results for the http-only inputs the original handled.
     */
    private String getHostname(String url) {
        int schemeEnd = url.indexOf("://");
        int searchFrom = (schemeEnd != -1) ? schemeEnd + 3 : 0;
        int slash = url.indexOf('/', searchFrom);
        return (slash != -1) ? url.substring(0, slash) : url;
    }

    /**
     * Recursively crawls outgoing links of {@code startUrl}, returning a lazy stream of
     * {@code startUrl} followed by every newly discovered same-hostname URL beneath it.
     *
     * NOTE: flatMap consumes each inner stream sequentially, so parallelism here comes
     * mainly from the top-level parallelStream() fan-out, not the recursion depth.
     */
    private Stream<String> crawl(String startUrl, HtmlParser htmlParser, String hostname, Set<String> visited) {
        Stream<String> descendants = htmlParser.getUrls(startUrl)
                .parallelStream()
                .filter(url -> isSameHostname(url, hostname))
                // Stateful filter: add() returns true only for the first thread to claim
                // this URL, so each page is fetched exactly once even under parallelism.
                .filter(visited::add)
                .flatMap(url -> crawl(url, htmlParser, hostname, visited));

        return Stream.concat(Stream.of(startUrl), descendants);
    }

    /**
     * Returns true when {@code url} belongs to {@code hostname}: it must start with the
     * hostname and either end there or continue with a path separator. The separator
     * check prevents prefix false-positives (e.g. "http://a.com" vs "http://a.common").
     */
    private boolean isSameHostname(String url, String hostname) {
        if (!url.startsWith(hostname)) return false;
        return url.length() == hostname.length() || url.charAt(hostname.length()) == '/';
    }
}

Java 8 features used in this solution:

  1. ConcurrentHashMap.newKeySet() — a thread-safe Set view backed by a ConcurrentHashMap (analogous to map.keySet());
  2. Stream.collect(Collectors.toList()) to materialize the stream into the result list;
  3. list.parallelStream() to fan the crawling work out across worker threads;
  4. stream operations: filter and flatMap.
Digesting…