diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index d9a6fbe0..d9764920 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -16,7 +16,12 @@
             <artifactId>jruby</artifactId>
             <version>1.7.6</version>
         </dependency>
+        <dependency>
+            <groupId>org.codehaus.groovy</groupId>
+            <artifactId>groovy-all</artifactId>
+            <version>2.1.6</version>
+        </dependency>
         <dependency>
             <groupId>org.python</groupId>
             <artifactId>jython</artifactId>
             <version>2.5.3</version>
diff --git a/webmagic-scripts/src/main/groovy/Github.groovy b/webmagic-scripts/src/main/groovy/Github.groovy
new file mode 100644
index 00000000..5764061e
--- /dev/null
+++ b/webmagic-scripts/src/main/groovy/Github.groovy
@@ -0,0 +1,31 @@
+// Crawl script for GitHub repository pages.
+// NOTE(review): Github, Site, match, addUrl, url, html and toInt are not defined
+// in this file; they are assumed to come from the hosting script DSL.
+Github {
+    Site {
+        sleepTime 0
+        timeOut 100
+        retryTimes 3
+        // Pick one user agent at random. The call is parenthesised because
+        // `userAgent [...]` parses as a subscript on a `userAgent` property,
+        // and a List has no `random` property.
+        def agents = ['a', 'b', 'c']
+        userAgent(agents[new Random().nextInt(agents.size())])
+    }
+    // Parenthesised so the closure is passed to match as its trailing argument.
+    match("https://github\\.com/\\w+/\\w+") {
+        // NOTE(review): this applies the page pattern to the current url again;
+        // confirm whether links extracted from html were intended instead.
+        addUrl(url.regex("https://github\\.com/\\w+/\\w+"))
+        // A map literal is returned: in `return { name: ... }` the braces form a
+        // closure and `name:` is a statement label, not a map key.
+        return [
+            name  : html.xpath("//h1[@class='entry-title public']/strong/a/text()"),
+            // The owner is captured from the page URL; the pattern is a regex, not an XPath.
+            author: url.regex("https://github\\.com/(\\w+)/.*"),
+            readme: html.xpath("//div[@id='readme']/tidyText()"),
+            // TODO(review): this reuses the readme XPath; supply the star-count selector.
+            star  : toInt(html.xpath("//div[@id='readme']/tidyText()"))
+        ]
+    }
+}