1242. Web Crawler multithreaded | Blogs ©️ Lei Xia

web crawler multithreaded

/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *     public List<String> getUrls(String url) {}
 * }
 */

class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = getHostname(startUrl);
        
        Set<String> visited = ConcurrentHashMap.newKeySet();
        visited.add(startUrl);
        
        return crawl(startUrl, htmlParser, hostname, visited).collect(Collectors.toList());
    }
    
    private String getHostname(String url) {
        int index = url.indexOf('/', 7);
        return (index != -1) ? url.substring(0, index) : url;
    }
    
    private Stream<String> crawl(String startUrl, HtmlParser htmlParser, String hostname, Set<String> visited) {
        Stream<String> stream = htmlParser.getUrls(startUrl)
            .parallelStream()
            .filter(url -> isSameHostname(url, hostname))
            .filter(url -> visited.add(url))
            .flatMap(url -> crawl(url, htmlParser, hostname, visited));
        
        return Stream.concat(Stream.of(startUrl), stream);
    }
    
    private boolean isSameHostname(String url, String hostname) {
        if (!url.startsWith(hostname)) return false;
        return url.length() == hostname.length() || url.charAt(hostname.length()) == '/';
    }
}

In Java8:

ConcurrentHashMap.newKeySet() ~ map.keySet();
Stream.collect(Collectors.xxx);
list.parallelStream();
stream: filter, flatMap;

Digesting…