add huaban processor
parent
fe224cbf66
commit
42508af041
@ -0,0 +1,45 @@
|
||||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.scheduler.RedisScheduler;
|
||||
import us.codecraft.webmagic.selenium.downloader.SeleniumDownloader;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-7-26 <br>
|
||||
* Time: 下午4:08 <br>
|
||||
*/
|
||||
public class HuabanProcessor implements PageProcessor {
|
||||
|
||||
private Site site;
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
|
||||
if (page.getUrl().toString().contains("pins")) {
|
||||
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString());
|
||||
} else {
|
||||
page.getResultItems().setSkip(true);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
if (site == null) {
|
||||
site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/");
|
||||
}
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new HuabanProcessor())
|
||||
.scheduler(new RedisScheduler("localhost"))
|
||||
.pipeline(new FilePipeline("/data/webmagic/test/"))
|
||||
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
|
||||
.runAsync();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue