From d1fc1cf305d51af303b0740f2e3b10a853b5e364 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Sat, 8 Jun 2013 20:20:30 +0800
Subject: [PATCH] add diaoyuwen

---
 .../java/us/codecraft/spider/SpiderTest.java  | 17 +++---
 .../spider/samples/DiaoyuwengProcessor.java   | 53 +++++++++++++++++++
 2 files changed, 62 insertions(+), 8 deletions(-)
 create mode 100644 src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java

diff --git a/src/test/java/us/codecraft/spider/SpiderTest.java b/src/test/java/us/codecraft/spider/SpiderTest.java
index 83e8e8f9..5c08b849 100644
--- a/src/test/java/us/codecraft/spider/SpiderTest.java
+++ b/src/test/java/us/codecraft/spider/SpiderTest.java
@@ -3,9 +3,8 @@ package us.codecraft.spider;
 import org.junit.Ignore;
 import org.junit.Test;
 import us.codecraft.spider.pipeline.FilePipeline;
-import us.codecraft.spider.processor.PageProcessor;
+import us.codecraft.spider.processor.SimplePageProcessor;
 import us.codecraft.spider.samples.HuxiuProcessor;
-import us.codecraft.spider.samples.MeicanProcessor;
 import us.codecraft.spider.schedular.FileCacheQueueSchedular;
 
 /**
@@ -24,12 +23,14 @@ public class SpiderTest {
 
     @Test
     public void testGlobalSpider(){
-        PageProcessor pageProcessor = new MeicanProcessor();
-        Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
-                processor(pageProcessor).run();
-//        SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://lol.duowan.com/", "http://lol.duowan.com/*.html");
-//        Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")).
-//                processor(pageProcessor2).run();
+//        PageProcessor pageProcessor = new MeicanProcessor();
+//        Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
+//                processor(pageProcessor).run();
+        SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
+        pageProcessor2.getSite().setEncoding("GBK");
+        System.out.println(pageProcessor2.getSite().getEncoding());
+        Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")).
+                processor(pageProcessor2).run();
     }
 
 
diff --git a/src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java b/src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java
new file mode 100644
index 00000000..586ec01a
--- /dev/null
+++ b/src/test/java/us/codecraft/spider/samples/DiaoyuwengProcessor.java
@@ -0,0 +1,53 @@
+package us.codecraft.spider.samples;
+
+import us.codecraft.spider.Page;
+import us.codecraft.spider.Site;
+import us.codecraft.spider.processor.PageProcessor;
+
+import java.util.List;
+
+/**
+ * Sample {@link PageProcessor} added together with the diaoyuweng.com crawl test.
+ * <p>
+ * NOTE(review): despite the class name, every rule below (link patterns, XPath, domain
+ * and start URL) targets www.dianping.com; this looks like a copy of a Dianping sample
+ * that still has to be retargeted - confirm before relying on it.
+ * <p>
+ * User: cairne
+ * Date: 13-4-21
+ * Time: 8:08 PM
+ */
+public class DiaoyuwengProcessor implements PageProcessor {
+    /**
+     * Hands the relative {@code /shop/...} and {@code /search/category/...} hrefs found in
+     * anchor tags to {@link Page#addTargetRequests}, and for pages whose URL contains
+     * {@code "shop"} stores the {@code title} and {@code content} fields.
+     *
+     * @param page the page to extract links and fields from
+     */
+    @Override
+    public void process(Page page) {
+        //http://progressdaily.diandian.com/post/2013-01-24/40046867275
+        List<String> requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/shop/.*?)[\"']{1}").toStrings();
+        page.addTargetRequests(requests);
+        requests = page.getHtml().rs("<a[^<>]*href=[\"']{1}(/search/category/.*?)[\"']{1}").toStrings();
+        page.addTargetRequests(requests);
+        // fields are recorded only when the page URL contains "shop"
+        if (page.getUrl().toString().contains("shop")){
+            page.putField("title", page.getHtml().x("//h1[@class='shop-title']"));
+            page.putField("content", page.getHtml().sc());
+        }
+    }
+
+    /**
+     * Builds the site settings used by this processor: domain, start URL and a desktop
+     * Chrome user-agent string.
+     *
+     * @return the {@link Site} configured for www.dianping.com
+     */
+    @Override
+    public Site getSite() {
+        return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/").
+                setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+    }
+}