diff --git a/README.md b/README.md index 8724e890..e92a4405 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,9 @@ webmagic的功能覆盖整个爬虫的生命周期(链接提取、页面下载 ###Get Started -webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫例子是这样的: - - Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); +webmagic定制的核心是PageProcessor接口。 -其中SimplePageProcessor实现如下: +例如,我们要实现一个简单的通用爬虫SimplePageProcessor,代码如下: public class SimplePageProcessor implements PageProcessor { @@ -53,16 +51,25 @@ webmagic定制的核心是PageProcessor接口。一个最简单的webmagic爬虫 @Override public void process(Page page) { List requests = page.getHtml().as().rs(urlPattern).toStrings(); + //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); + //xpath方式抽取 page.putField("title", page.getHtml().x("//title")); + //sc表示使用Readability技术抽取正文 page.putField("content", page.getHtml().sc()); } @Override public Site getSite() { + //定义抽取站点的相关参数 return site; } } + +调用这个爬虫的代码如下: + + Spider.me().processor(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*/blog/*")).run(); + ### 示例 diff --git a/pom.xml b/pom.xml index 3772cd01..68927f20 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ us.codecraft 0.0.1-SNAPSHOT 4.0.0 - + pom webmagic diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index a8165bb4..eb8f56ea 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -29,13 +29,17 @@ public class SimplePageProcessor implements PageProcessor { @Override public void process(Page page) { List requests = page.getHtml().as().rs(urlPattern).toStrings(); + //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); + //xpath方式抽取 page.putField("title", page.getHtml().x("//title")); + //sc表示使用Readability技术抽取正文 page.putField("content", page.getHtml().sc()); } @Override public Site getSite() { + //定义抽取站点的相关参数 return site; } }