From fc68c0f7d86fbfb7188b7dff6d812e269e48ddcb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 16 Aug 2013 12:22:00 +0800 Subject: [PATCH] add docs in en --- en_docs/README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 en_docs/README.md diff --git a/en_docs/README.md b/en_docs/README.md new file mode 100644 index 00000000..c420f499 --- /dev/null +++ b/en_docs/README.md @@ -0,0 +1,98 @@ +webmagic +--- +[![Build Status](https://travis-ci.org/code4craft/webmagic.png?branch=master)](https://travis-ci.org/code4craft/webmagic) + +>A crawler framework. + +## Features: + +* Simple core with high flexibility. +* Simple API for HTML extraction. +* Multi-threaded and distributed support. +* Easy to integrate. + +## Install: + +Clone the repo and build: + + git clone https://github.com/code4craft/webmagic.git + cd webmagic + mvn clean install + +Add dependencies to your project: + + <dependency> + <groupId>us.codecraft</groupId> + <artifactId>webmagic-core</artifactId> + <version>0.2.0</version> + </dependency> + <dependency> + <groupId>us.codecraft</groupId> + <artifactId>webmagic-extension</artifactId> + <version>0.2.0</version> + </dependency> + +## Get Started: + +### First crawler: + +Write a class that implements PageProcessor: + + public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net") + .addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List<String> links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); + page.putField("content", page.getHtml().$("div.content").toString()); + page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()) + .pipeline(new ConsolePipeline()).run(); + } + } + +* 
`page.addTargetRequests(links)` + + Add URLs for crawling. + +You can also use the annotation-based approach: + + @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") + public class OschinaBlog { + + @ExtractBy("//title") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List<String> tags; + + public static void main(String[] args) { + OOSpider.create( + Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), + new ConsolePageModelPipeline(), OschinaBlog.class).run(); + } + } + +### Samples: + +There are some samples in the `webmagic-samples` package. + +### License: + +Licensed under the [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0) \ No newline at end of file