From 862ceee6745b7a85367f9358d8031422e3d5bfe7 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Tue, 13 Jan 2015 17:18:27 +0800
Subject: [PATCH] groovy demo

---
Review notes (commentary placed between the "---" line and the diffstat is
not part of the commit message):
 * Github.groovy reads as a sketch of a scraping-rule DSL: a Site block of
   settings, a match block keyed by a URL pattern, and a returned group of
   named fields. Github, Site, match, addUrl, html, url and toInt are not
   defined anywhere in this patch; confirm their semantics in the host code.
 * 'author' is extracted with url.regex because its argument is a URL regex,
   not an XPath expression; every html.xpath call here takes an XPath.
 * The '.' in "github.com" is escaped in all three URL patterns so they agree.
 * NOTE(review): in plain Groovy, "return { label: expr ... }" returns a
   closure containing labelled statements, not a map; presumably the DSL
   evaluates it to collect the fields -- confirm.
 * NOTE(review): the pom.xml hunk below shows only bare text (presumably the
   contents of Maven <groupId>/<artifactId>/<version> elements); the XML
   markup and the context line breaks are missing from this copy. Restore
   the hunk from the repository before applying.

 webmagic-scripts/pom.xml                       |  5 +++++
 webmagic-scripts/src/main/groovy/Github.groovy | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 webmagic-scripts/src/main/groovy/Github.groovy

diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index d9a6fbe0..d9764920 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -16,6 +16,11 @@
 jruby 1.7.6
+
+ org.codehaus.groovy
+ groovy-all
+ 2.1.6
+
 org.python jython 2.5.3
diff --git a/webmagic-scripts/src/main/groovy/Github.groovy b/webmagic-scripts/src/main/groovy/Github.groovy
new file mode 100644
index 00000000..5764061e
--- /dev/null
+++ b/webmagic-scripts/src/main/groovy/Github.groovy
@@ -0,0 +1,18 @@
+Github {  // one rule set: site settings plus a URL-matched extraction block
+    Site {  // fetch settings handed to the DSL as name/value pairs
+        sleepTime 0
+        timeOut 100
+        retryTimes 3
+        userAgent ['a','b','c'].random  // NOTE(review): presumably picks one entry at random; confirm the DSL provides .random
+    }
+    match "https://github\\.com/\\w+/\\w+" {  // body applies to URLs of the form github.com/<word>/<word>
+        addUrl(url.regex("https://github\\.com/\\w+/\\w+"))  // NOTE(review): presumably queues matching URLs; confirm addUrl semantics
+        return {
+            name: html.xpath("//h1[@class='entry-title public']/strong/a/text()")
+            author: url.regex "https://github\\.com/(\\w+)/.*"  // first path segment of the URL, captured by group 1
+            readme: html.xpath "//div[@id='readme']/tidyText()"
+            star : toInt(html.xpath("//div[@id='readme']/tidyText()"))  // NOTE(review): same XPath as readme, looks like copy-paste; confirm the star-count selector
+        }
+    }
+
+}