fix huabanprocessor

pull/84/head
yihua.huang 11 years ago
parent 50cee4c7bb
commit a34e92d11a

@@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
if (page.getUrl().toString().contains("pins")) {
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/img/@src").toString());
page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString());
} else {
page.getResultItems().setSkip(true);
}
@@ -31,15 +31,16 @@ public class HuabanProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
site = Site.me().setDomain("huaban.com").addStartUrl("http://huaban.com/").setSleepTime(0);
site = Site.me().setDomain("huaban.com").setSleepTime(0);
}
return site;
}
public static void main(String[] args) {
Spider.create(new HuabanProcessor()).thread(5)
.pipeline(new FilePipeline("/data/webmagic/test/"))
.downloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.addPipeline(new FilePipeline("/data/webmagic/test/"))
.setDownloader(new SeleniumDownloader("/Users/yihua/Downloads/chromedriver"))
.addUrl("http://huaban.com/")
.runAsync();
}
}

@@ -45,7 +45,7 @@ WebMagic使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的
在**lib**目录下有项目依赖的所有jar包，直接在IDE里将这些jar添加到Libraries即可。
![import jars](http://static.oschina.net/uploads/space/2014/0403/102848_ETcU_190591.png)
![import jars](http://static.oschina.net/uploads/space/2014/0403/143318_gBQE_190591.jpeg)
### 1.3 第一个项目
@@ -154,6 +154,8 @@ Intellij Idea默认自带Maven支持import项目时选择Maven项目即可。
![runlog](http://static.oschina.net/uploads/space/2014/0403/103741_3Gf5_190591.png)
<div style="page-break-after:always"></div>
## 3. 基本的爬虫
### 3.1 抽取内容(xpath, regex, css selector, jsonpath)

Loading…
Cancel
Save