From ff134a4fb7b6c8737d1ea5876ec453d81b7c2afb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 1 Apr 2014 08:07:40 +0800 Subject: [PATCH] add contributor --- README.md | 98 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index a0c75b28..8d40752d 100644 --- a/README.md +++ b/README.md @@ -37,16 +37,29 @@ webmagic的github地址:[https://github.com/code4craft/webmagic](https://githu webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用webmagic: - - us.codecraft - webmagic-core - 0.4.3 - - - us.codecraft - webmagic-extension - 0.4.3 - +```xml + + us.codecraft + webmagic-core + 0.4.3 + + + us.codecraft + webmagic-extension + 0.4.3 + +``` + +WebMagic 使用slf4j-log4j12作为slf4j的实现.如果你自己定制了slf4j的实现,请在项目中去掉此依赖。 + +```xml + + + org.slf4j + slf4j-log4j12 + + +``` #### 项目结构 @@ -83,30 +96,30 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java - public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setDomain("my.oschina.net"); - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); - } + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } - @Override - public Site getSite() { - return site; + @Override + public Site getSite() { + return site; - } + } - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); - } + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + .addPipeline(new ConsolePipeline()).run(); } +} ``` @@ -121,24 +134,24 @@ Spider是爬虫的入口类。Pipeline是结果输出和持久化的接口,这 webmagic-extension包括了注解方式编写爬虫的方法,只需基于一个POJO增加注解即可完成一个爬虫。以下仍然是抓取oschina博客的一段代码,功能与OschinaBlogPageProcesser完全相同: ```java - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog { - @ExtractBy("//title") - private String title; + @ExtractBy("//title") + private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; - public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); - } - } + public static void main(String[] args) { + OOSpider.create( + Site.me(), + new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + } +} ``` 这个例子定义了一个Model类,Model类的字段'title'、'content'、'tags'均为要抽取的属性。这个类在Pipeline里是可以复用的。 @@ -175,6 +188,7 @@ webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0) * [ywooer](https://github.com/ywooer) * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) +* [ccliangbo](https://github.com/ccliangbo) * [lidongyang](http://my.oschina.net/lidongyang) ### 邮件组: