diff --git a/README.md b/README.md index 7eee9e3d..1f4bc130 100644 --- a/README.md +++ b/README.md @@ -50,33 +50,35 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf ### First crawler: -Write a class implements PageProcessor: +Write a class that implements PageProcessor. For example, I wrote a crawler of GitHub repository information. ```java -public class OschinaBlogPageProcesser implements PageProcessor { +public class GithubRepoPageProcessor implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net"); + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { - List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); - page.addTargetRequests(links); - page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1").toString()); - page.putField("content", page.getHtml().$("div.content").toString()); - page.putField("tags",page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override public Site getSite() { return site; - } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") - .addPipeline(new ConsolePipeline()).run(); + Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); } } + ``` * `page.addTargetRequests(links)` @@ -86,22 +88,23 @@ public class 
OschinaBlogPageProcesser implements PageProcessor { You can also use annotation way: ```java -@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog { +@TargetUrl("https://github.com/\\w+/\\w+") +@HelpUrl("https://github.com/\\w+") +public class GithubRepo { - @ExtractBy("//title") - private String title; + @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true) + private String name; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) - private String content; + @ExtractByUrl("https://github\\.com/(\\w+)/.*") + private String author; - @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) - private List tags; + @ExtractBy("//div[@id='readme']/tidyText()") + private String readme; public static void main(String[] args) { - OOSpider.create( - Site.me(), - new ConsolePageModelPipeline(), OschinaBlog.class).addUrl("http://my.oschina.net/flashsword/blog").run(); + OOSpider.create(Site.me().setSleepTime(1000) + , new ConsolePageModelPipeline(), GithubRepo.class) + .addUrl("https://github.com/code4craft").thread(5).run(); } } ``` diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java index 179bad43..c5122658 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcessor implements PageProcessor { - private Site site = Site.me().setRetryTimes(3).setSleepTime(100); + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 78f27570..cd8c12f4 100644 
--- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -23,12 +23,6 @@ us.codecraft webmagic-core ${project.version} - - - org.slf4j - slf4j-log4j12 - - junit