By: Alvaro "Blag" Tejada Galindo
Re-posted from: http://blagrants.blogspot.com/2014/07/web-scrapping-with-julia-and-phatomjs.html
As I have been reading some PhantomJS books and I’m always looking to develop something nice using Julia…I thought that integrating them would be an awesome idea -;)
I thought about Twitter and the hashtags…wouldn’t it be nice to write a PhantomJS script to webscrape Twitter and get all the hashtags that I have used?
For this particular script…I’m taking the hashtags from the first 5 Twitter pages linked to my profile…
Hashtags.js |
// Hashtags.js — PhantomJS script that opens a Twitter profile, scrolls the
// page a fixed number of times so more tweets get loaded, then prints the
// text of every hashtag link found, one per line, to stdout.
//
// Usage: phantomjs Hashtags.js <twitter-user>
//
// NOTE: `var` and function expressions are used deliberately; PhantomJS does
// not run ES2015+ syntax (const/let/arrow functions).
var system = require('system');
var webpage = require('webpage').create();

// Number of scroll steps to perform before collecting the hashtags.
var SCROLL_COUNT = 5;
// Delay between scroll steps, giving the page time to load more content.
var SCROLL_DELAY_MS = 3000;

webpage.viewportSize = { width: 1280, height: 800 };
webpage.scrollPosition = { top: 0, left: 0 };

var userid = system.args[1];

if (!userid) {
  // Without a user id the script would request ".../undefined".
  console.error('usage: phantomjs Hashtags.js <twitter-user>');
  phantom.exit(1);
} else {
  var profileUrl = "http://www.twitter.com/" + userid;

  webpage.open(profileUrl, function (status) {
    if (status !== 'success') {
      console.error('webpage did not open successfully');
      phantom.exit(1);
      // Execution may continue after phantom.exit(); return so the scroll
      // timer below is never scheduled on a failed load.
      return;
    }

    var scrolls = 0;
    var timer = setInterval(function () {
      // Runs inside the page: current full height of the document.
      var pageHeight = webpage.evaluate(function () {
        return document.body.scrollHeight;
      });

      scrolls++;
      // Scroll just past the current bottom of the document.
      webpage.scrollPosition = { top: pageHeight + 1, left: 0 };

      if (scrolls < SCROLL_COUNT) {
        return;
      }

      // Done scrolling: stop the timer so this callback cannot fire again.
      clearInterval(timer);

      // Runs inside the page: collect the visible text of each hashtag link.
      // Only locals are used here, so nothing leaks into the page's globals.
      var hashtags = webpage.evaluate(function () {
        var links = document.querySelectorAll('[data-query-source="hashtag_click"]');
        return Array.prototype.map.call(links, function (el) {
          return el.innerText;
        });
      });

      // Guard against evaluate() yielding no value.
      (hashtags || []).forEach(function (tag) {
        console.log(tag);
      });

      phantom.exit();
    }, SCROLL_DELAY_MS);
  });
}
|
If we run this…we’re going to have this output…
Now…what I want to do with this information…is to send it to Julia…and get the most used hashtags…so I will count how often each one appears and then get rid of the ones that only appear once…
Let’s see the Julia code…
Twitter_Hashtags.jl |
# Twitter_Hashtags.jl — runs the PhantomJS scraper, counts how many times each
# hashtag line appears in its output, and prints the ones seen more than once.

# Capture everything Hashtags.js prints (one hashtag per line).
# NOTE(review): `readall` is kept from the original script; newer Julia
# releases may require a different call to read command output — confirm
# against the Julia version in use.
tweets = readall(`phantomjs Hashtags.js Blag`)
tweets = split(tweets, "\n")

hashtags = Dict()
for tag in tweets
    # Splitting on "\n" yields empty strings for blank lines (e.g. after the
    # final newline); those are not hashtags, so skip them.
    isempty(tag) && continue
    # get() supplies 0 for a hashtag not seen yet, so no exception handling is
    # needed to initialise the counter.
    hashtags[tag] = get(hashtags, tag, 0) + 1
end

# Keep only hashtags that were used more than once.
filter!((k,v)->v>1,hashtags)

for (k,v) in hashtags
    println("$k has been mentioned $v times")
end
|
When we run this code…we’re going to have this output…
I still don’t know how to sort Dicts in Julia…so bear with me -:)
Anyway…by looking at the output…we can have my top 3 hashtags -;)
#LeapMotion ==> 14 times
#Flare3D ==> 11 times
#DevHangout ==> 8 times
Hope you like this and see you next time -:)
Greetings,
Blag.
Development Culture.