昨天 cnBeta 突然改版了，页面实在不清爽，引来一大堆人吐槽。正好前阵子看了下 Node.js，发现用它抓取网页超级简单，就自己动手抓取一下吧：只要几十行代码，比用 Ruby 方便很多，主要是 cheerio 用起来太顺手，而且性能很好。
index.js
var util = require('../lib/util.js'); var cheerio = require('cheerio'); exports.index = function(req, res) { var url = "http://www.cnbeta.com"; util.get(url, function(content, status) { var $ = cheerio.load(content); var realtime_list = $('.realtime_list').html(); var news_list = $("#allnews_all").html(); res.render("index", {realtime_list: realtime_list, news: news_list}) }); };
lib/util.js
/**
 * Tiny helper object wrapping the `request` library.
 * A single shared instance is exported at the bottom of this module.
 */
var util = function() {};

// NOTE(review): `http` is not used by the code visible here; kept in case
// other parts of the file rely on it.
var http = require('http');
var request = require('request');

/**
 * Fetch `url` and pass the response body to `callback`.
 *
 * `callback` is invoked only when the request finished without an error
 * and the response status code is 200. Any other outcome used to be dropped
 * silently; it is now reported to `onError` when one is supplied, and
 * logged with `console.error` otherwise, so failures are never invisible.
 *
 * @param {string} url - Address to fetch.
 * @param {function(string, number)} callback - Receives `(body, statusCode)`
 *   on a successful 200 response.
 * @param {function(Error, (number|undefined))} [onError] - Optional. Receives
 *   the failure and the HTTP status code (undefined when no response object
 *   is available).
 */
util.prototype.get = function(url, callback, onError) {
  request(url, function(error, response, body) {
    if (!error && response.statusCode === 200) {
      callback(body, response.statusCode);
      return;
    }

    // `response` may be missing when the request itself failed.
    var statusCode = response ? response.statusCode : undefined;
    var failure = error ||
      new Error('GET ' + url + ' returned HTTP status ' + statusCode);

    if (typeof onError === 'function') {
      onError(failure, statusCode);
    } else {
      // No error handler given: existing two-argument callers keep their
      // behavior (no callback on failure), but the problem is logged.
      console.error('util.get failed for ' + url + ':', failure);
    }
  });
};

module.exports = new util();
其它就没什么了，比用 Ruby 抓取简单太多。DEMO： http://cnbeta-news.herokuapp.com/ ，完整代码： https://github.com/doabit/cnbeta_news_spider