diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java deleted file mode 100644 index dcb6eff9..00000000 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ /dev/null @@ -1,37 +0,0 @@ -package us.codecraft.webmagic.samples; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; - -/** - * @author code4crafter@gmail.com
- */ -public class SinaBlogProcesser implements PageProcessor { - - private Site site; - - @Override - public void process(Page page) { - page.addTargetRequests(page.getHtml().xpath("//div[@class='articalfrontback SG_j_linedot1 clearfix']").links().all()); - page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); - } - - @Override - public Site getSite() { - if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/s/blog_4701280b0102egl0.html").setSleepTime(3000). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); - } - return site; - } - - public static void main(String[] args) { - Spider.create(new SinaBlogProcesser()).run(); - } -} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java new file mode 100644 index 00000000..2872e02b --- /dev/null +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcessor.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ */ +public class SinaBlogProcessor implements PageProcessor { + + public static final String URL_LIST = "http://blog\\.sina\\.com\\.cn/s/articlelist_1487828712_0_\\d+\\.html"; + + public static final String URL_POST = "http://blog\\.sina\\.com\\.cn/s/blog_\\w+\\.html"; + + private Site site = Site + .me() + .setDomain("blog.sina.com.cn") + .setSleepTime(3000) + .setUserAgent( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + + @Override + public void process(Page page) { + //列表页 + if (page.getUrl().regex(URL_LIST).match()) { + page.addTargetRequests(page.getHtml().xpath("//div[@class=\"articleList\"]").links().regex(URL_POST).all()); + page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); + //文章页 + } else { + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content", page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("date", + page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); + } + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html") + .run(); + } +}