From 633e0fe8347872bbd07db24e35a1df2a2efc20e6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 28 Nov 2013 11:39:19 +0800 Subject: [PATCH] document for avalon --- webmagic-avalon.md | 24 ++++++++++ webmagic-scripts/README.md | 47 +++++++++++++++++++ .../src/main/resources/js/github.js | 14 ++++++ .../src/main/resources/js/oschina.js | 2 +- .../src/main/resources/ruby/github.rb | 10 ++++ 5 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 webmagic-avalon.md create mode 100644 webmagic-scripts/README.md create mode 100644 webmagic-scripts/src/main/resources/js/github.js create mode 100644 webmagic-scripts/src/main/resources/ruby/github.rb diff --git a/webmagic-avalon.md b/webmagic-avalon.md new file mode 100644 index 00000000..975efede --- /dev/null +++ b/webmagic-avalon.md @@ -0,0 +1,24 @@ +WebMagic-Avalon项目手册 +======= +WebMagic-Avalon项目的目标是打造一个可配置、可管理的爬虫,以及一个可分享配置/脚本的平台,从而减少熟悉Java的开发者的开发量,并且让**不熟悉Java技术的人**也能简单地使用一个爬虫。 + +## Part1:webmagic-scripts + +目标:使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。 +例如:我需要抓github的仓库数据,可以这样写一个脚本(javascript): + +[https://github.com/code4craft/webmagic/tree/master/webmagic-scripts](https://github.com/code4craft/webmagic/tree/master/webmagic-scripts) + +这个功能目前实现了一部分,但最终结果仍在实验阶段。欢迎大家积极参与并提出意见。 + +## Part2:webmagic-panel + +一个集成了加载脚本、管理爬虫的后台。计划中。 + +## Part3:webmagic-market + +一个可以分享、搜索和下载脚本的站点。计划中。 + +## 如何参与 + +webmagic目前 \ No newline at end of file diff --git a/webmagic-scripts/README.md b/webmagic-scripts/README.md new file mode 100644 index 00000000..8077bf94 --- /dev/null +++ b/webmagic-scripts/README.md @@ -0,0 +1,47 @@ +webmagic-scripts +====== +## 目标: +使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。 + +## 实例: +例如:我需要抓github的仓库数据,可以这样写一个脚本(javascript): + +```javascript +var name=xpath("//h1[@class='entry-title public']/strong/a/text()") +var readme=xpath("//div[@id='readme']/tidyText()") +var star=xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") +var 
fork=xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") +var url=page.getUrl().toString() +if (name!=null){ + println(name) + println(readme) + println(star) + println(url) +} + +urls("(https://github\\.com/\\w+/\\w+)") +urls("(https://github\\.com/\\w+)") +``` + +然后使用webmagic加载并启动它,无需下载依赖、编写代码、执行的过程。 + +如果已经有人写好了脚本,那么你直接使用就可以了! + +## 语言: + +选用javascript是因为用户面比较广。目前还支持ruby语言,选用ruby是因为ruby的语法编写DSL更简洁: + +```ruby +name= xpath "//h1[@class='entry-title public']/strong/a/text()" +readme = xpath "//div[@id='readme']/tidyText()" +star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()" +fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()" +url=$page.getUrl().toString() + +puts name,readme,star,fork,url unless name==nil + +urls "(https://github\\.com/\\w+/\\w+)" +urls "(https://github\\.com/\\w+)" +``` + +这个功能目前仍在实验阶段。欢迎大家积极参与并提出意见。 \ No newline at end of file diff --git a/webmagic-scripts/src/main/resources/js/github.js b/webmagic-scripts/src/main/resources/js/github.js new file mode 100644 index 00000000..d5e40c45 --- /dev/null +++ b/webmagic-scripts/src/main/resources/js/github.js @@ -0,0 +1,14 @@ +var name=xpath("//h1[@class='entry-title public']/strong/a/text()") +var readme=xpath("//div[@id='readme']/tidyText()") +var star=xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") +var fork=xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") +var url=page.getUrl().toString() +if (name!=null){ + println(name) + println(readme) + println(star) + println(url) +} + +urls("(https://github\\.com/\\w+/\\w+)") +urls("(https://github\\.com/\\w+)") \ No newline at end of file diff --git a/webmagic-scripts/src/main/resources/js/oschina.js b/webmagic-scripts/src/main/resources/js/oschina.js index 0a11ade3..305682ea 100644 --- a/webmagic-scripts/src/main/resources/js/oschina.js +++ 
b/webmagic-scripts/src/main/resources/js/oschina.js @@ -8,4 +8,4 @@ var config = { } title = $("div.BlogTitle h1"), content = $("div.BlogContent") -urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") \ No newline at end of file +urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") diff --git a/webmagic-scripts/src/main/resources/ruby/github.rb b/webmagic-scripts/src/main/resources/ruby/github.rb new file mode 100644 index 00000000..3248bcc6 --- /dev/null +++ b/webmagic-scripts/src/main/resources/ruby/github.rb @@ -0,0 +1,10 @@ +name= xpath "//h1[@class='entry-title public']/strong/a/text()" +readme = xpath "//div[@id='readme']/tidyText()" +star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()" +fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()" +url=$page.getUrl().toString() + +puts name,readme,star,fork,url unless name==nil + +urls "(https://github\\.com/\\w+/\\w+)" +urls "(https://github\\.com/\\w+)" \ No newline at end of file