I'm trying to use a fixed number of tabs and operate on them while waiting on network I/O, to speed up my crawling process for a single domain. I've solved the same problem in Go with a worker pool listening on a channel, but I'm not sure how to solve it with Node.js and Puppeteer.

My guess is that the looping in `nextPage()` is the culprit:

```
async function nextPage() {
    try {
        for (var link of uncrawledLinks.keys()) {
            if (runningThreads < maxThreads) {
                var page = await browser.newPage();
                console.log("nextPage() # runningThreads: " + runningThreads + " # uncrawledLinks.size: " + uncrawledLinks.size);
                //debugger;
                crawlPage(page, link);
            }
        }
```

This probably creates a problem when I increase `maxThreads` above 1, but I couldn't pin down the issue so far. What happens with values above 1 is that links get crawled twice, three or four times (not every link, only ~90% of them), so I get duplicates, which makes the crawler unusable.

I thought about using a DB like Redis or SQLite, but I want to solve the problem without one first to understand it better (and so far I don't have a performance/memory problem, so keeping everything in memory is fine).

Runnable sample code:

```
'use strict';

const puppeteer = require('puppeteer');
const url = require('url');

// start URL
const startUrlObj = url.parse("http://example.de/");
const startUrlDomain = startUrlObj.protocol + "//" + startUrlObj.hostname;
const startUrl = url.format(startUrlObj);

let browser;
let pages = [];
let uncrawledLinks = new Map();
let crawledLinks = [];
let runningThreads = 0;
const maxThreads = 1;

start();

async function start() {
    console.log("Starting Crawler");
    browser = await puppeteer.launch();
    console.log("Finished initializing browser object");
    uncrawledLinks.set(startUrl, "");
    nextPage();
}

async function crawlPage(page, link) {
    try {
        console.log("starting crawl for: " + link);
        runningThreads++;
        const response = await page.goto(link, { waitUntil: 'networkidle2', timeout: 30000 });

        // find all links on the page
        const hrefs = await page.$$eval('a', as => as.map(a => a.href));
        hrefs.forEach(function (foundLink) {
            if (foundLink.startsWith(startUrlDomain)) {
                var tempUrl = url.parse(foundLink);
                // remove #asd and ?param1=y values from URL
                tempUrl.hash = null;
                tempUrl.search = null;
                var tempLink = url.format(tempUrl);
                //console.log(url.format(tempLink));
                if (crawledLinks.includes(tempLink) === false) {
                    if (tempLink.endsWith(".html") === true) {
                        uncrawledLinks.set(tempLink, "false");
                        //pages.push(tempLink);
                    }
                }
            }
        });
        //console.log("Found new links: " + i + " # " + link);

        // crawling queues
        uncrawledLinks.delete(link);
        crawledLinks.push(link);
    } catch (error) {
        // Log errors
        console.error(error);
    } finally {
        runningThreads--;
        await page.close();
        await nextPage();
    }
}

async function nextPage() {
    try {
        for (var link of uncrawledLinks.keys()) {
            if (runningThreads < maxThreads) {
                var page = await browser.newPage();
                //console.log("nextPage() # runningThreads: " + runningThreads + " # uncrawledLinks.size: " + uncrawledLinks.size);
                //debugger;
                crawlPage(page, link);
            }
        }
    } catch (error) {
        // Log errors
        console.error(error);
    } finally {
        if (uncrawledLinks.size === 0 && runningThreads === 0) {
            console.log("Finished crawling");
            console.log(crawledLinks);
            await browser.close();
        }
    }
}
```
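For reference, here is a minimal sketch of how the Go worker-pool idea might map to Node.js: a fixed number of workers share one queue, and a link is marked as seen and removed from the queue before any `await`, so no two workers can claim the same link. The names `NUM_WORKERS` and `worker()` are just placeholders, the URL normalization from above is left out, and this simplified version ignores the case where the queue momentarily runs empty while another worker is still producing new links.

```
'use strict';

const puppeteer = require('puppeteer');

const startUrl = 'http://example.de/';   // same start URL as above
const NUM_WORKERS = 4;                   // illustrative pool size
const queue = [startUrl];                // links still to crawl
const seen = new Set(queue);             // every link ever enqueued
const crawled = [];                      // finished links

async function worker(browser) {
    const page = await browser.newPage();
    while (queue.length > 0) {
        // shift() is synchronous, so the link is claimed before the first await
        const link = queue.shift();
        try {
            await page.goto(link, { waitUntil: 'networkidle2', timeout: 30000 });
            const hrefs = await page.$$eval('a', as => as.map(a => a.href));
            for (const href of hrefs) {
                // deduplicate at enqueue time instead of at crawl time
                if (href.startsWith(startUrl) && href.endsWith('.html') && !seen.has(href)) {
                    seen.add(href);
                    queue.push(href);
                }
            }
            crawled.push(link);
        } catch (error) {
            console.error(error);
        }
    }
    await page.close();
}

(async () => {
    const browser = await puppeteer.launch();
    // start a fixed number of workers and wait until all of them have drained the queue
    await Promise.all(Array.from({ length: NUM_WORKERS }, () => worker(browser)));
    console.log('Finished crawling', crawled);
    await browser.close();
})();
```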
Submitted April 03, 2018 at 01:03PM by Yojihito