Scrape websites with nodejs!

August 27th, 2011

I really wanted to experiment a little bit with nodejs.

Target

So some days ago a friend of mine told me that he used a cool python library to scrape the information he wanted from a site! That sounded to me like a very good exercise!

Solution

The idea is to use jQuery's powerful selectors to extract whatever I want from an html page. If I can do that, I can also extract the url of the next page and then put the mechanism to work again for that page! The code I finally had to write was about 50 lines!

var jsdom = require( 'jsdom' ),

	/**
	 * Scrapes the page at conf.url and, by following conf.getNextUrl,
	 * every page after it.  Invokes conf.done once every page in the
	 * chain has been processed.
	 *
	 * conf.url        - url of the page to scrape
	 * conf.getNextUrl - function( $ ) returning the url of the next
	 *                   page (a falsy value stops the chain)
	 * conf.filterPage - function( $ ) that extracts data from the page
	 * conf.done       - called only after the last page has been filtered
	 *
	 * counter is internal: the number of pages this link of the chain
	 * is still waiting on.  Callers never pass it.
	 */
	scrapy = function( conf, counter ) {
		var url = conf.url || null,
			getNextUrl = conf.getNextUrl || null,
			filterPage = conf.filterPage || null,
			finalDone = conf.done || null,
			done = function() {
				counter--;
				if ( counter === 0 && finalDone !== null) {
					finalDone();
				}
			};

		counter = counter || 1;

		jsdom.env({
			html: url,
			scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
			done: function( errors, window ) {
				if ( errors ) {
					// Bail out on a failed page, but still run done()
					// so the counter bookkeeping stays consistent and
					// the final callback eventually fires.  (The old
					// code fell through and crashed on window.$,
					// leaving conf.done uncalled forever.)
					console.log( errors );
					done();
					return;
				}

				var $ = window.$,
					nexturl;

				// Kick off the next page first so it can start loading
				// while the current page is being filtered.
				if ( getNextUrl ) {
					nexturl = getNextUrl( $ );
					if ( nexturl ) {
						// counter++ is load-bearing: it keeps this page
						// "pending" until the child chain reports back
						// through the shared done callback, which is
						// what delays finalDone until the very end.
						scrapy( {
							url: nexturl,
							getNextUrl: getNextUrl,
							filterPage: filterPage,
							done: done }, counter++ );
					}
				}

				if ( filterPage ) {
					filterPage( $ );
				}
				else {
					console.log( 'No filterPage available' );
				}

				done();
			}
		});
	};

exports.scrapy = scrapy;

The conf argument that the scrapy function expects should have a url (the page we want to filter), a filterPage function that filters the page and a getNextUrl function that filters the page to find the next url that should be scraped. If you want, you can also provide a done function which will get called only when all the scraping has finished! Here is an example:

var scrapy = require('./scrapy.js').scrapy,

	// Builds an ad record from the row that starts at the given title
	// cell, walking the sibling cells one by one.
	parseRow = function( $cell ) {
		var ad = {},
			cols;

		ad.title = $cell.find('a').html().replace(/\s*$/g, '');

		$cell = $cell.next();
		ad.address = $cell.find('a').html().replace( /<br>/g, ',' );

		// Third cell: rooms / floor / living space, separated by <br>.
		$cell = $cell.next();
		cols = $cell.find('a').html().split( '<br>' );
		ad.rooms = cols[0].replace( /\sRooms[\n|\s]+/g,  '' );
		ad.floor = cols[1].replace( /.\s Floor/g, '' );
		ad.space = cols[2].replace( /[\n|\s]+$/g, '' );

		// Fourth cell: type / build year / price, same layout.
		$cell = $cell.next();
		cols = $cell.find('a').html().split('<br>');
		ad.type = cols[0].replace( /[\n|\s]+$/g, '' );
		ad.build = cols[1].replace( /[\n|\s]+$/g, '' );
		ad.price = cols[2];

		return ad;
	},

	// Every ad collected across all scraped pages.
	ads = [];

scrapy( {
		url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",
		getNextUrl: function( $ ) {
				// The pagination "forward" arrow links to the next page.
				return $('a.forward.iconLink').attr( 'href' );
			},
		filterPage: function( $ ) {
				$('#objectList tr')
					.each( function() {
						var $title = $( this ).find( '.tdTitle' );

						// Only rows with exactly one title cell are ads.
						if ( $title.size() === 1 ) {
							var ad = parseRow( $title );
							ads.push( ad );
							console.log( ad );
						}
					});
				console.log( 'Found ' + ads.length);
			},
		done: function() {
				console.log( 'Done! found ' + ads.length + ' ads in total!');
			}
		} );
valotas.com v3.6.1 © Γιώργος Βαλοτάσιος - CSS inspired by Adam Wathan's blog
The greek name "Γιώργος" is also know as Yoryos, Georgios or just George which seems to be easier to most english speaking people. If you are trying to find out what Βαλοτασιος means, just think of it as Valotasios and you should be fine.