diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 32fec16e..9bdbed83 100644 --- a/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -49,9 +49,7 @@ public class FreemarkerPipeline implements Pipeline { template.process(page.getFields(), printWriter); printWriter.close(); } catch (TemplateException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } catch (IOException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. } } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index efd1ff7e..53b10520 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.samples; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -12,18 +12,34 @@ import java.util.List; * Time: 下午8:08 */ public class DiandianBlogProcessor implements PageProcessor { + + private Site site; + @Override public void process(Page page) { - //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + //a() extracts a link, as() extracts all links + //getHtml() returns an Html object that supports method chaining + //r() extracts a single match with a regular expression, rs() extracts multiple matches + //toString() takes a single result, toStrings() takes multiple results + List requests = 
page.getHtml().as().rs("(.*/post/.*)").toStrings(); + //use page.addTargetRequests() to add the links to be crawled to the queue page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + //page.putField(key,value) adds the extracted content to the result Map + //x() and xs() extract using xpath + page.putField("title", page.getHtml().x("//title").r("(.*?)\\|")); + //sc() extracts the main text directly using the readability technique; it is fairly accurate for well-structured text + page.putField("content", page.getHtml().sc()); + page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/")); + page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)")); } @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + //site defines the extraction configuration, the start url, etc. + if (site == null) { + site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); + } + return site; } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 33b86bb9..03389f5b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -13,6 +13,9 @@ import java.util.List; * Time: 下午8:08 */ public class DiaoyuwengProcessor implements PageProcessor { + + private Site site; + @Override public void process(Page page) { List requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); @@ -29,7 +32,10 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). - setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); + if (site==null){ + site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). 
+ setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); + } + return site; } } diff --git a/webmagic-samples/src/main/resources/combine.sh b/webmagic-samples/src/main/resources/combine.sh index 5a2c0c2d..0e7bd0c8 100644 --- a/webmagic-samples/src/main/resources/combine.sh +++ b/webmagic-samples/src/main/resources/combine.sh @@ -3,6 +3,6 @@ touch wordpress.xml cat wp-head.xml >> wordpress.xml for f in `ls`; do - cat ${f} >> wordpress.xml + cat ${f} >> ../wordpress.xml done; cat wp-bottom.xml >> wordpress.xml \ No newline at end of file diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java new file mode 100644 index 00000000..18b0680d --- /dev/null +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.processor; + +import org.junit.Test; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.pipeline.ConsolePipeline; +import us.codecraft.webmagic.pipeline.FreemarkerPipeline; +import us.codecraft.webmagic.samples.DiandianBlogProcessor; +import us.codecraft.webmagic.schedular.FileCacheQueueSchedular; + +import java.io.IOException; + +/** + * User: cairne + * Date: 13-6-9 + * Time: 8:02 AM + */ +public class DiandianProcessorTest { + + @Test + public void test() throws IOException { + DiandianBlogProcessor diaoyuwengProcessor = new DiandianBlogProcessor(); + //a pipeline is the processing done after crawling finishes + //put the ftl file under the classpath:ftl/ folder + //by default placed under the /data/temp/webmagic/ftl/[domain] directory + FreemarkerPipeline pipeline = new FreemarkerPipeline("wordpress.ftl"); + //Spider.me() is shorthand; it really just creates a new instance + //Spider.pipeline() sets a pipeline and supports method chaining + //ConsolePipeline outputs results to the console + //FileCacheQueueSchedular saves urls and supports resuming an interrupted crawl; temporary files are written to the /data/temp/webmagic/cache directory + //Spider.run() executes it 
+ Spider.me().pipeline(new ConsolePipeline()).pipeline(pipeline).schedular(new FileCacheQueueSchedular(diaoyuwengProcessor.getSite(), "/data/temp/webmagic/cache/")). + processor(diaoyuwengProcessor).run(); + } +}