Hey,I've written a basic web scraper, it works great, but there's something I couldn't understand.When it's done, it starts doing the job again, why? Is it something to do with activating it by visiting the url?Beware, it's not well structured and messy. If you get blind while reading it, I take no responsibilities.app.get('/async20', function(req, res) { var people = []; for (var a = 0; a < 100; a++) { people.push("irrelevant url is here); } async.mapLimit(people, 20, function(url, callback) { // iterator function var options2 = { url: url, headers: { 'User-Agent': req.headers['user-agent'], 'Content-Type': 'application/json; charset=utf-8' } }; request(options2, function(error, response, body) { if (!error && response.statusCode == 200) { async.series([ // add person function(callback) { var $ = cheerio.load(body); var name = entities.decodeHTML($('span[itemprop="personname"]').text()); // add person - db operations // add url as a uq. column new person({ name: name, url: url }).save().then(function(ehi1) { console.log("success"); }); async.setImmediate(function() { callback(); }); }, function(callback) { async.waterfall([ function(callback) { var $ = cheerio.load(body); var jobs = $('span[itemprop="persontitle"]').length; var jobsArr = []; for (var aa = 0; aa < jobs; aa++) { jobsArr.push(entities.decodeHTML($('span[itemprop="persontitle"]').eq(aa).text())); } async.setImmediate(function() { callback(null, jobsArr); }); }, function(jobsArr, callback) { async.map(jobsArr, function(jobs, callback) { personRole.where('name', jobs).fetch({ require: true }).then(function(data1) { data1 = data1.toJSON(); person.where('url', url).fetch().then(function(data2) { data2 = data2.toJSON(); new personPersonRole({ person_id: data2.id, personrole_id: data1.id }).save().then(function(ehi2) { console.log("success 1"); }); }); }).catch(function(err) { new personRole({ name: jobs }).save().then(function(data3) { data3 = data3.toJSON(); person.where('url', url).fetch().then(function(data4) { data4 = data4.toJSON(); new personPersonRole({ person_id: data4.id, personrole_id: data3.id }).save().then(function(ehi3) { console.log("success 3"); }); }); }); }); }); async.setImmediate(function() { callback(null, "yes"); }); } ], function(err, result) { if (err) { console.log(err); } }); async.setImmediate(function() { callback(); }); } ], function(err, result) { if (err) { console.log("err3"); } }); } else { console.log("err4"); } }); async.setImmediate(function() { callback(); }); }); });
Submitted March 04, 2016 at 02:33AM by laraveling
No comments:
Post a Comment