From 818a2b2408bd85bcab0dd966132be83b94b0bfda Mon Sep 17 00:00:00 2001
From: "yihua.huang" <code4crafter@gmail.com>
Date: Thu, 1 Jun 2017 07:51:11 +0800
Subject: [PATCH] invite kotlin experimental

---
Review notes (this section is ignored by `git am`):
 * pom.xml: `<sourceDirectory>` takes a single directory, so the
   comma-separated value below is presumably not read as three source
   roots - TODO confirm against the Maven POM reference.
 * Only kotlin-stdlib is added here; unless kotlin-maven-plugin is
   configured elsewhere, Github.kt is presumably never compiled - TODO confirm.

 webmagic-scripts/pom.xml                   | 10 ++++++
 webmagic-scripts/src/main/kotlin/Github.kt | 51 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 webmagic-scripts/src/main/kotlin/Github.kt

diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml
index b126d9f8..404a6dd8 100755
--- a/webmagic-scripts/pom.xml
+++ b/webmagic-scripts/pom.xml
@@ -9,6 +9,9 @@
     <groupId>us.codecraft</groupId>
     <artifactId>webmagic-scripts</artifactId>
 
+    <properties>
+        <kotlin.version>1.1.2-2</kotlin.version>
+    </properties>
 
     <dependencies>
         <dependency>
@@ -16,6 +19,12 @@
             <artifactId>jruby</artifactId>
             <version>1.7.6</version>
         </dependency>
+        <dependency>
+            <groupId>org.jetbrains.kotlin</groupId>
+            <artifactId>kotlin-stdlib</artifactId>
+            <version>${kotlin.version}</version>
+        </dependency>
+
         <dependency>
             <groupId>org.codehaus.groovy</groupId>
             <artifactId>groovy-all</artifactId>
@@ -48,6 +57,7 @@
     </dependencies>
 
     <build>
+        <sourceDirectory>${project.basedir}/src/main/kotlin,${project.basedir}/src/main/groovy,${project.basedir}/src/main/java</sourceDirectory>
         <plugins>
             <plugin>
                 <artifactId>maven-compiler-plugin</artifactId>
diff --git a/webmagic-scripts/src/main/kotlin/Github.kt b/webmagic-scripts/src/main/kotlin/Github.kt
new file mode 100644
index 00000000..3d6ca218
--- /dev/null
+++ b/webmagic-scripts/src/main/kotlin/Github.kt
@@ -0,0 +1,51 @@
+
+import us.codecraft.webmagic.Page
+import us.codecraft.webmagic.Site
+import us.codecraft.webmagic.Spider
+import us.codecraft.webmagic.processor.PageProcessor
+import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor
+
+/**
+ * Page processor that follows GitHub links and stores the `author`, `name`
+ * and `readme` fields of each page; pages without a `name` are skipped.
+ *
+ * NOTE(review): the import of `processor.example.GithubRepoPageProcessor` has
+ * the same simple name as this class - confirm which one [main] instantiates.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 2017/5/31
+ * Time: 11:33 PM
+ */
+class GithubRepoPageProcessor : PageProcessor {
+
+    // Crawl configuration handed back to the framework through getSite().
+    private val site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)
+
+    /**
+     * Queues the GitHub links found on [page] and extracts its fields.
+     */
+    override fun process(page: Page) {
+        // Two-segment links: https://github.com/<segment>/<segment>
+        page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all())
+        // One-segment links: without the `+` only one character after the host would be captured.
+        page.addTargetRequests(page.html.links().regex("(https://github\\.com/[\\w\\-]+)").all())
+        // Same character class as the link patterns above, so hyphenated names are captured too.
+        page.putField("author", page.url.regex("https://github\\.com/([\\w\\-]+)/.*").toString())
+        page.putField("name", page.html.xpath("//h1[@class='public']/strong/a/text()").toString())
+        if (page.resultItems.get<Any>("name") == null) {
+            // No name was extracted from this page, so skip it.
+            page.setSkip(true)
+        }
+        page.putField("readme", page.html.xpath("//div[@id='readme']/tidyText()"))
+    }
+
+    override fun getSite(): Site = site
+
+    companion object {
+        /** Runs a spider with 5 threads, starting at https://github.com/code4craft. */
+        @JvmStatic
+        fun main(args: Array<String>) {
+            Spider.create(GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run()
+        }
+    }
+}