I am trying to scrape a few sites. Here is my code:
// Scrape each URL in `urls`, extract the price from the page, and append
// "url ; price" lines to `output`, which is finally written to test.txt.
// FIX: the loop header was garbled — the loop variable `i` was missing
// (`for (var = 0; < urls.length; ...)`) — and `getelementbyid` had lost
// its casing (it would throw a TypeError); both restored below.
// NOTE(review): page.open() is asynchronous. This loop fires all requests
// at once and each call clobbers the one still in flight, so at best only
// the last request completes — the requests must be chained sequentially.
for (var i = 0; i < urls.length; i++) {
    url = urls[i];
    console.log("start scraping: " + url);
    page.open(url, function () {
        // Wait until the page's progress indicator signals it is done.
        waitfor(function () {
            return page.evaluate(function () {
                return document.getElementById("progresswrapper").childNodes.length == 1;
            });
        }, function () {
            // Placeholder extraction — the evaluate body is commented out,
            // so `price` is undefined until real extraction code is added.
            var price = page.evaluate(function () {
                // return price;
            });
            console.log(price);
            result = url + " ; " + price;
            output = output + "\r\n" + result;
        });
    });
}
// NOTE(review): this runs before any of the async callbacks above fire,
// so the file is written (and PhantomJS exits) before any scraping happens.
fs.write('test.txt', output);
phantom.exit();
I want to scrape the sites in the array urls, extract some information, and write that information to a text file.
But there seems to be a problem with the loop. When scraping a single site without the loop, everything works as I want. With the loop, at first nothing happens, then the line
console.log("start scraping: " + url);
is shown, but one time too many. If urls = [a, b, c], PhantomJS does:
start scraping: a start scraping: b start scraping: c start scraping:
It seems that page.open isn't being called at all. I'm a newbie to JS, so sorry if this is a stupid question.
PhantomJS is asynchronous. By calling page.open()
multiple times in a loop, you rush the execution of the callbacks. You're overwriting the current request with a new request before it has finished, and that one is in turn overwritten. You need to execute them one after the other, for example like this:
// Chain the page.open() calls: each request starts only inside the
// completion callback of the previous one (one nesting level per URL).
page.open(url, function () {
    waitfor(function () {
        // ...loaded-condition for the first page...
    }, function () {
        page.open(url, function () {
            waitfor(function () {
                // ...loaded-condition for the second page...
            }, function () {
                // ...and so on for the remaining URLs
            });
        });
    });
});
But this is tedious. There are utilities that can help you write nicer asynchronous code, such as async.js. You can install it in the directory of the PhantomJS script through npm.
// Build one task function per URL and let async.series run them strictly
// one after another; the final callback writes the results and exits.
var async = require("async"); // installed via npm
var tasks = urls.map(function (url) {
    return function (callback) {
        page.open(url, function () {
            waitfor(function () {
                // ...page-loaded condition...
            }, function () {
                callback(); // tell async.series to start the next task
            });
        });
    };
});
async.series(tasks, function finish() {
    fs.write('test.txt', output);
    phantom.exit();
});
If you don't want any dependencies, it is easy to define your own recursive function (adapted from here):
var urls = [/*....*/];

// Open one page, wait until it has finished loading, then move on to the
// next URL in the queue.
function handle_page(url) {
    page.open(url, function () {
        waitfor(function () {
            // ...page-loaded condition...
        }, function () {
            next_page();
        });
    });
}

// Dequeue the next URL and process it; exit once the queue is empty.
// FIX: the original checked `if (!urls)`, which is never true — an array
// is truthy even when empty — so it would call handle_page(undefined)
// forever instead of exiting. Test the dequeued value instead.
function next_page() {
    var url = urls.shift();
    if (url === undefined) {
        phantom.exit(0);
        return; // don't start another page after scheduling the exit
    }
    handle_page(url);
}

next_page();
Comments
Post a Comment