From 98163a3e40cc94dbc8c6d5102374aca26137e8d9 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sun, 18 Dec 2016 07:46:18 +0800
Subject: [PATCH] update examples

---
 .../example/BaiduBaikePageProcessor.java      |  4 +-
 .../example/GithubRepoPageProcessor.java      |  2 +-
 .../example/OschinaBlogPageProcessor.java     | 39 -------------------
 .../processor/example/ZhihuPageProcessor.java | 36 +++++++++++++++++
 4 files changed, 39 insertions(+), 42 deletions(-)
 delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
index 1dc3352f..f6ad87e0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcessor.java
@@ -20,8 +20,8 @@ public class BaiduBaikePageProcessor implements PageProcessor {
 
     @Override
     public void process(Page page) {
-        page.putField("name", page.getHtml().css("h1.title div.lemmaTitleH1","text").toString());
-        page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
+        page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1","text").toString());
+        page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
     }
 
     @Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
index 0ca3b7f4..955bd5a3 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
  */
 public class GithubRepoPageProcessor implements PageProcessor {
 
-    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
 
     @Override
     public void process(Page page) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java
deleted file mode 100644
index 053c155d..00000000
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package us.codecraft.webmagic.processor.example;
-
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Site;
-import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.processor.PageProcessor;
-
-import java.util.List;
-
-/**
- * @author code4crafter@gmail.com
- */
-public class OschinaBlogPageProcessor implements PageProcessor {
-
-    private Site site = Site.me().setDomain("my.oschina.net");
-
-    @Override
-    public void process(Page page) {
-        List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all();
-        page.addTargetRequests(links);
-        page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString());
-        if (page.getResultItems().get("title") == null) {
-            //skip this page
-            page.setSkip(true);
-        }
-        page.putField("content", page.getHtml().smartContent().toString());
-        page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all());
-    }
-
-    @Override
-    public Site getSite() {
-        return site;
-
-    }
-
-    public static void main(String[] args) {
-        Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run();
-    }
-}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
new file mode 100644
index 00000000..a2a17e8c
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/ZhihuPageProcessor.java
@@ -0,0 +1,36 @@
+package us.codecraft.webmagic.processor.example;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.6.0
+ */
+public class ZhihuPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
+
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());
+        page.putField("title", page.getHtml().xpath("//h2[@class='zm-item-title']/a/text()").toString());
+        page.putField("question", page.getHtml().xpath("//div[@id='zh-question-detail']//tidyText()").toString());
+        page.putField("answer", page.getHtml().xpath("//div[@id='zh-question-answer-wrap']//div[@class='zm-editable-content']/tidyText()").toString());
+        if (page.getResultItems().get("title")==null){
+            //skip this page
+            page.setSkip(true);
+        }
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        Spider.create(new ZhihuPageProcessor()).addUrl("https://www.zhihu.com/explore").run();
+    }
+}