diff --git a/pom.xml b/pom.xml
index e7290bc9..206d3c59 100644
--- a/pom.xml
+++ b/pom.xml
@@ -136,7 +136,7 @@
 org.jsoup jsoup
- 1.7.2
+ 1.8.3
 org.mockito
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
new file mode 100644
index 00000000..3aa742c1
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -0,0 +1,63 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.commons.io.IOUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.selector.PlainText;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Test-only {@link Downloader} that performs no network access: every call returns the
+ * canned GitHub repository page stored on the test classpath.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class MockGithubDownloader implements Downloader {
+
+    /** Classpath location of the canned HTML page. */
+    private static final String MOCK_RESOURCE = "/html/mock-github.html";
+
+    /** URL reported on every returned page, whatever request was passed in. */
+    private static final String MOCK_URL = "https://github.com/code4craft/webmagic";
+
+    /**
+     * Builds a page from the canned HTML.
+     *
+     * @param request ignored; the returned page always carries {@link #MOCK_URL}
+     * @param task    ignored
+     * @return a page whose raw text is the content of {@link #MOCK_RESOURCE}
+     * @throws IllegalStateException if the resource is missing or cannot be read
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        InputStream resourceAsStream = this.getClass().getResourceAsStream(MOCK_RESOURCE);
+        if (resourceAsStream == null) {
+            throw new IllegalStateException("Mock resource not found on classpath: " + MOCK_RESOURCE);
+        }
+        Page page = new Page();
+        try {
+            // Explicit charset so the result does not depend on the platform default;
+            // the fixture holds non-ASCII text. NOTE(review): assumes it is stored as UTF-8 - confirm.
+            page.setRawText(IOUtils.toString(resourceAsStream, "UTF-8"));
+        } catch (IOException e) {
+            // Fail loudly instead of handing back a page that has no content.
+            throw new IllegalStateException("Failed to read mock resource: " + MOCK_RESOURCE, e);
+        } finally {
+            IOUtils.closeQuietly(resourceAsStream);
+        }
+        page.setRequest(new Request(MOCK_URL));
+        page.setUrl(new PlainText(MOCK_URL));
+        return page;
+    }
+
+    /**
+     * No-op: the thread count is ignored.
+     *
+     * @param threadNum ignored
+     */
+    @Override
+    public void setThread(int threadNum) {
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java
new file mode 100644
index 00000000..c9e3548e
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.example;
+
+import org.junit.Test;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.downloader.MockGithubDownloader;
+import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Runs {@link GithubRepoPageProcessor} against the canned page served by
+ * {@link MockGithubDownloader} and checks the extracted fields.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 16/1/19
+ * Time: 7:27 AM
+ */
+public class GithubRepoPageProcessorTest {
+
+    @Test
+    public void test_github() throws Exception {
+        // One-element holder so the anonymous pipeline can hand its input back to the test.
+        final ResultItems[] captured = new ResultItems[1];
+        Spider.create(new GithubRepoPageProcessor()).addPipeline(new Pipeline() {
+            @Override
+            public void process(ResultItems resultItems, Task task) {
+                captured[0] = resultItems;
+            }
+        }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
+
+        // NOTE(review): assumes Spider.test(...) processes the URL before returning - confirm.
+        // Without this check the test passes vacuously when the pipeline is never invoked.
+        assertThat(captured[0]).as("result items passed to the pipeline").isNotNull();
+        assertThat(((String) captured[0].get("name")).trim()).isEqualTo("webmagic");
+        assertThat(((String) captured[0].get("author")).trim()).isEqualTo("code4craft");
+    }
+}
diff --git a/webmagic-core/src/test/resources/html/mock-github.html b/webmagic-core/src/test/resources/html/mock-github.html
new file mode 100644
index 00000000..df53d870
--- /dev/null
+++ b/webmagic-core/src/test/resources/html/mock-github.html
@@ -0,0 +1,1580 @@
+ + + + + + + + + + + + + + + code4craft/webmagic + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip to content + + + + + + + + + + + + + + +
+ +
+
+ + +
+
+
+ +
+
+ + + +
    + +
  • +
    + +
    + + + + Unwatch + + + + +
    + +
    +
    +
    +
  • + +
  • + +
    + +
    + + +
    +
    + + +
    + +
  • + +
  • + + + Fork + + + + + +
  • +
+ +

+ + /webmagic + + + + + +

+ +
+ +
+ +
+
+ + +
+ + A scalable web crawler framework. + http://webmagic.io/ + + + Edit +
+ +
+ + +
+ +
+ + +
+ + + or Cancel +
+ + + + +
+ Java + CSS + JavaScript + FreeMarker + HTML + Ruby +
+ + + +
+
+
+ +
+ +
+ + + + Find file + +
+
+ +
+
+ + +
+
+
+ +
+ +
+ +
+ +
+ +
+ +
+
+
+ +
+ + + +
+ + + +
+
+ + + + + + + New pull request + + + +
+ + + + +
+ + Latest commit + + 800f66c + + + + + + + @code4craft + + + + Revert "remove some unkown config" + + + + + +
This reverts commit 0e245c9.
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Failed to load latest commit information.
+ + + + assets + + + 同步官方源码 + + + +
+ + + + en_docs + + + docs + + + +
+ + + + webmagic-avalon + + + update version to snapshot + + + +
+ + + + webmagic-core + + + 修正FileCacheQueueScheduler导致程序不能正常结束和未关闭流 + + + +
+ + + + webmagic-extension + + + Merge pull request #237 from SpenceZhou/master + + + +
+ + + + webmagic-samples + + + Merge pull request #227 from hsqlu/master + + + +
+ + + + webmagic-saxon + + + update version + + + +
+ + + + webmagic-scripts + + + update version + + + +
+ + + + webmagic-selenium + + + update and validate pom.xml + + + +
+ + + + zh_docs + + + contributor + + + +
+ + + + .gitignore + + + change_gitignore + + + +
+ + + + .travis.yml + + + remove ci for jdk6 + + + +
+ + + + README.md + + + contributor + + + +
+ + + + pom.xml + + + Revert "remove some unkown config" + + + +
+ + + + release-note.md + + + #34 Close reader in FileCacheQueueScheduler + + + +
+ + + + user-manual.md + + + deperate in user manual + + + +
+ + + + webmagic-avalon.md + + + scripts readme + + + +
+ +
+ + + +
+

+ + README.md +

+ +

logo

+ +

Readme in Chinese

+ +

User Manual (Chinese)

+ +

Build Status

+ +
+

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

+
+ +

Features:

+ +
    +
  • Simple core with high flexibility.
  • +
  • Simple API for html extracting.
  • +
  • Annotation with POJO to customize a crawler, no configuration.
  • +
  • Multi-thread and Distribution support.
  • +
  • Easy to be integrated.
  • +
+ +

Install:

+ +

Add dependencies to your pom.xml:

+ +
<dependency>
+    <groupId>us.codecraft</groupId>
+    <artifactId>webmagic-core</artifactId>
+    <version>0.5.2</version>
+</dependency>
+<dependency>
+    <groupId>us.codecraft</groupId>
+    <artifactId>webmagic-extension</artifactId>
+    <version>0.5.2</version>
+</dependency>
+ +

WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12.

+ +
<exclusions>
+    <exclusion>
+        <groupId>org.slf4j</groupId>
+        <artifactId>slf4j-log4j12</artifactId>
+    </exclusion>
+</exclusions>
+ +

Get Started:

+ +

First crawler:

+ +

Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation.

+ +
public class GithubRepoPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
+
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
+        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
+        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+        if (page.getResultItems().get("name")==null){
+            //skip this page
+            page.setSkip(true);
+        }
+        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}
+ +
    +
  • page.addTargetRequests(links)

    + +

    Add urls for crawling.

  • +
+ +

You can also use annotation way:

+ +
@TargetUrl("https://github.com/\\w+/\\w+")
+@HelpUrl("https://github.com/\\w+")
+public class GithubRepo {
+
+    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
+    private String name;
+
+    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
+    private String author;
+
+    @ExtractBy("//div[@id='readme']/tidyText()")
+    private String readme;
+
+    public static void main(String[] args) {
+        OOSpider.create(Site.me().setSleepTime(1000)
+                , new ConsolePageModelPipeline(), GithubRepo.class)
+                .addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}
+ +

Docs and samples:

+ +

Documents: http://webmagic.io/docs/

+ +

The architecture of webmagic (refered to Scrapy)

+ +

image

+ +

Javadocs: http://code4craft.github.io/webmagic/docs/en/

+ +

There are some samples in webmagic-samples package.

+ +

Lisence:

+ +

Lisenced under Apache 2.0 lisence

+ +

Contributors:

+ +

Thanks these people for commiting source code, reporting bugs or suggesting for new feature:

+ + + +

Thanks:

+ +

To write webmagic, I refered to the projects below :

+ + + +

Mail-list:

+ +

https://groups.google.com/forum/#!forum/webmagic-java

+ +

http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988

+ +

QQ Group: 373225642

+ +

Bitdeli Badge

+
+
+ + +
+ +
+ +
+
+ +
+ +
+ +
+ + + + + + + +
+ + + Something went wrong with that request. Please try again. +
+ + + + + + + + + + + + + diff --git a/webmagic-extension/src/test/resources/html/mock-github.html b/webmagic-extension/src/test/resources/html/mock-github.html new file mode 100644 index 00000000..df53d870 --- /dev/null +++ b/webmagic-extension/src/test/resources/html/mock-github.html @@ -0,0 +1,1580 @@ + + + + + + + + + + + + + + + code4craft/webmagic + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip to content + + + + + + + + + + + + + + +
+ +
+
+ + +
+
+
+ +
+
+ + + +
    + +
  • +
    + +
    + + + + Unwatch + + + + +
    + +
    +
    +
    +
  • + +
  • + +
    + +
    + + +
    +
    + + +
    + +
  • + +
  • + + + Fork + + + + + +
  • +
+ +

+ + /webmagic + + + + + +

+ +
+ +
+ +
+
+ + +
+ + A scalable web crawler framework. + http://webmagic.io/ + + + Edit +
+ +
+ + +
+ +
+ + +
+ + + or Cancel +
+ + + + +
+ Java + CSS + JavaScript + FreeMarker + HTML + Ruby +
+ + + +
+
+
+ +
+ +
+ + + + Find file + +
+
+ +
+
+ + +
+
+
+ +
+ +
+ +
+ +
+ +
+ +
+
+
+ +
+ + + +
+ + + +
+
+ + + + + + + New pull request + + + +
+ + + + +
+ + Latest commit + + 800f66c + + + + + + + @code4craft + + + + Revert "remove some unkown config" + + + + + +
This reverts commit 0e245c9.
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Failed to load latest commit information.
+ + + + assets + + + 同步官方源码 + + + +
+ + + + en_docs + + + docs + + + +
+ + + + webmagic-avalon + + + update version to snapshot + + + +
+ + + + webmagic-core + + + 修正FileCacheQueueScheduler导致程序不能正常结束和未关闭流 + + + +
+ + + + webmagic-extension + + + Merge pull request #237 from SpenceZhou/master + + + +
+ + + + webmagic-samples + + + Merge pull request #227 from hsqlu/master + + + +
+ + + + webmagic-saxon + + + update version + + + +
+ + + + webmagic-scripts + + + update version + + + +
+ + + + webmagic-selenium + + + update and validate pom.xml + + + +
+ + + + zh_docs + + + contributor + + + +
+ + + + .gitignore + + + change_gitignore + + + +
+ + + + .travis.yml + + + remove ci for jdk6 + + + +
+ + + + README.md + + + contributor + + + +
+ + + + pom.xml + + + Revert "remove some unkown config" + + + +
+ + + + release-note.md + + + #34 Close reader in FileCacheQueueScheduler + + + +
+ + + + user-manual.md + + + deperate in user manual + + + +
+ + + + webmagic-avalon.md + + + scripts readme + + + +
+ +
+ + + +
+

+ + README.md +

+ +

logo

+ +

Readme in Chinese

+ +

User Manual (Chinese)

+ +

Build Status

+ +
+

A scalable crawler framework. It covers the whole lifecycle of crawler: downloading, url management, content extraction and persistent. It can simplify the development of a specific crawler.

+
+ +

Features:

+ +
    +
  • Simple core with high flexibility.
  • +
  • Simple API for html extracting.
  • +
  • Annotation with POJO to customize a crawler, no configuration.
  • +
  • Multi-thread and Distribution support.
  • +
  • Easy to be integrated.
  • +
+ +

Install:

+ +

Add dependencies to your pom.xml:

+ +
<dependency>
+    <groupId>us.codecraft</groupId>
+    <artifactId>webmagic-core</artifactId>
+    <version>0.5.2</version>
+</dependency>
+<dependency>
+    <groupId>us.codecraft</groupId>
+    <artifactId>webmagic-extension</artifactId>
+    <version>0.5.2</version>
+</dependency>
+ +

WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf4j implementation, please exclude slf4j-log4j12.

+ +
<exclusions>
+    <exclusion>
+        <groupId>org.slf4j</groupId>
+        <artifactId>slf4j-log4j12</artifactId>
+    </exclusion>
+</exclusions>
+ +

Get Started:

+ +

First crawler:

+ +

Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation.

+ +
public class GithubRepoPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
+
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
+        page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
+        page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+        if (page.getResultItems().get("name")==null){
+            //skip this page
+            page.setSkip(true);
+        }
+        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}
+ +
    +
  • page.addTargetRequests(links)

    + +

    Add urls for crawling.

  • +
+ +

You can also use annotation way:

+ +
@TargetUrl("https://github.com/\\w+/\\w+")
+@HelpUrl("https://github.com/\\w+")
+public class GithubRepo {
+
+    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
+    private String name;
+
+    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
+    private String author;
+
+    @ExtractBy("//div[@id='readme']/tidyText()")
+    private String readme;
+
+    public static void main(String[] args) {
+        OOSpider.create(Site.me().setSleepTime(1000)
+                , new ConsolePageModelPipeline(), GithubRepo.class)
+                .addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}
+ +

Docs and samples:

+ +

Documents: http://webmagic.io/docs/

+ +

The architecture of webmagic (refered to Scrapy)

+ +

image

+ +

Javadocs: http://code4craft.github.io/webmagic/docs/en/

+ +

There are some samples in webmagic-samples package.

+ +

Lisence:

+ +

Lisenced under Apache 2.0 lisence

+ +

Contributors:

+ +

Thanks these people for commiting source code, reporting bugs or suggesting for new feature:

+ + + +

Thanks:

+ +

To write webmagic, I refered to the projects below :

+ + + +

Mail-list:

+ +

https://groups.google.com/forum/#!forum/webmagic-java

+ +

http://list.qq.com/cgi-bin/qf_invite?id=023a01f505246785f77c5a5a9aff4e57ab20fcdde871e988

+ +

QQ Group: 373225642

+ +

Bitdeli Badge

+
+
+ + +
+ +
+ +
+
+ +
+ +
+ +
+ + + + + + + +
+ + + Something went wrong with that request. Please try again. +
+ + + + + + + + + + + + +