diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java index 1a2d8894..0aecb7bf 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java @@ -4,4 +4,34 @@ package us.codecraft.webmagic.samples; * @author code4crafer@gmail.com */ public class GithubRepo { -} + + private String name; + + private String author; + + private String readme; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getAuthor() { + return author; + } + + public void setAuthor(String author) { + this.author = author; + } + + public String getReadme() { + return readme; + } + + public void setReadme(String readme) { + this.readme = readme; + } +} \ No newline at end of file diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java index db498a84..0de61fb6 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java @@ -7,7 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor; /** * @author code4crafter@gmail.com
- * @since 0.3.2 + * @since 0.5.1 */ public class GithubRepoPageProcessor implements PageProcessor { @@ -17,13 +17,16 @@ public class GithubRepoPageProcessor implements PageProcessor { public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); - page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); - page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); - if (page.getResultItems().get("name")==null){ + GithubRepo githubRepo = new GithubRepo(); + githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); + if (githubRepo.getName() == null) { //skip this page page.setSkip(true); + } else { + page.putField("repo", githubRepo); } - page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java new file mode 100644 index 00000000..2458c8a7 --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java @@ -0,0 +1,7 @@ +package us.codecraft.webmagic.samples.pipeline; + +/** + * @author code4crafter@gmail.com + */ +public class ReplacePipeline { +}