diff --git a/README.md b/README.md index cebaecd9..cc63925d 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.4.3 + 0.5.0 us.codecraft webmagic-extension - 0.4.3 + 0.5.0 ``` diff --git a/en_docs/README.md b/en_docs/README.md index cccbf3f8..cc63925d 100644 --- a/en_docs/README.md +++ b/en_docs/README.md @@ -1,4 +1,4 @@ -![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg) +![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg) [Readme in Chinese](https://github.com/code4craft/webmagic/tree/master/zh_docs) @@ -21,48 +21,63 @@ Add dependencies to your pom.xml: - - us.codecraft - webmagic-core - 0.4.3 - - - us.codecraft - webmagic-extension - 0.4.3 - +```xml + + us.codecraft + webmagic-core + 0.5.0 + + + us.codecraft + webmagic-extension + 0.5.0 + +``` + +WebMagic uses slf4j with the slf4j-log4j12 implementation. If you have customized your slf4j implementation, please exclude slf4j-log4j12. + +```xml + + + org.slf4j + slf4j-log4j12 + + +``` + ## Get Started: ### First crawler: -Write a class implements PageProcessor: +Write a class that implements PageProcessor. For example, I wrote a crawler for GitHub repository information. 
```java - public class OschinaBlogPageProcesser implements PageProcessor { - - private Site site = Site.me().setDomain("my.oschina.net"); - - @Override - public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); +public class GithubRepoPageProcessor implements PageProcessor { + + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + } - @Override - public Site getSite() { - return site; - - } + @Override + public Site getSite() { + return site; + } - public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); - } + public static void main(String[] args) { + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } +} ``` * `page.addTargetRequests(links)` @@ -72,28 +87,31 @@ Write a class implements PageProcessor: You can also use annotation way: ```java - @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") - public class OschinaBlog { +@TargetUrl("https://github.com/\\w+/\\w+") 
+@HelpUrl("https://github.com/\\w+") +public class GithubRepo { - @ExtractBy("//title") - private String title; + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy("//div[@id='readme']/tidyText()") + private String readme; - public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); - } - } + public static void main(String[] args) { + OOSpider.create(Site.me().setSleepTime(1000) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(5).run(); + } +} ``` ### Docs and samples: +Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) + The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) ![image](http://code4craft.github.io/images/posts/webmagic.png) @@ -110,6 +128,7 @@ Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) Thanks these people for commiting source code, reporting bugs or suggesting for new feature: +* [ccliangbo](https://github.com/ccliangbo) * [yuany](https://github.com/yuany) * [yxssfxwzy](https://github.com/yxssfxwzy) * [linkerlin](https://github.com/linkerlin) @@ -124,6 +143,8 @@ Thanks these people for commiting source code, reporting bugs or suggesting for * [yyw258520](https://github.com/yyw258520) * [perfecking](https://github.com/perfecking) * [lidongyang](http://my.oschina.net/lidongyang) +* [seveniu](https://github.com/seveniu) +* [sebastian1118](https://github.com/sebastian1118) ### Thanks: @@ -146,6 +167,10 @@ To write webmagic, I refered to the projects below : 
[https://groups.google.com/forum/#!forum/webmagic-java](https://groups.google.com/forum/#!forum/webmagic-java) +[http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988](http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988) + +QQ Group: 373225642 + [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/code4craft/webmagic/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/webmagic-avalon/forger/pom.xml b/webmagic-avalon/forger/pom.xml index 44b42f9f..89796d15 100644 --- a/webmagic-avalon/forger/pom.xml +++ b/webmagic-avalon/forger/pom.xml @@ -7,7 +7,7 @@ us.codecraft forger - 0.1.1-SNAPSHOT + 0.1.0 4.0.0 jar diff --git a/webmagic-avalon/webmagic-avalon-common/pom.xml b/webmagic-avalon/webmagic-avalon-common/pom.xml index 3b068990..92f56314 100644 --- a/webmagic-avalon/webmagic-avalon-common/pom.xml +++ b/webmagic-avalon/webmagic-avalon-common/pom.xml @@ -26,7 +26,7 @@ us.codecraft forger - 0.1.1-SNAPSHOT + 0.1.0 diff --git a/zh_docs/README.md b/zh_docs/README.md index b336367d..cee747c9 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -1,4 +1,4 @@ -![logo](https://raw.github.com/code4craft/webmagic/master/asserts/logo.jpg) +![logo](https://raw.github.com/code4craft/webmagic/master/assets/logo.jpg) [![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) @@ -6,7 +6,7 @@ [Readme in English](https://github.com/code4craft/webmagic/tree/master/en_docs) -[用户手册](https://github.com/code4craft/webmagic/blob/master/user-manual.md) +官方网站[http://webmagic.io/](http://webmagic.io/) >webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。作者曾经在前公司进行过一年的垂直爬虫的开发,webmagic就是为了解决爬虫开发的一些重复劳动而产生的框架。 @@ -41,12 +41,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.4.3 + 0.5.0 us.codecraft webmagic-extension - 0.4.3 + 0.5.0 ``` @@ -158,7 +158,7 @@ public class 
OschinaBlog { ### 详细文档 -见[webmagic manual.md](https://github.com/code4craft/webmagic/blob/master/user-manual.md)。 +见[http://webmagic.io/docs/](http://webmagic.io/docs/)。 ### 示例