diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
index 8be5fabb..97470e04 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java
@@ -20,6 +20,15 @@ public class ConsolePipeline implements Pipeline{
         }
         System.out.println("get page: "+resultItems.getRequest().getUrl());
         for (Map.Entry entry : resultItems.getAll().entrySet()) {
-            System.out.println(entry.getKey()+":\t"+entry.getValue());
+            if (entry.getValue() instanceof Iterable) {
+                // Iterable values are printed one element per line beneath the key; others stay on one line.
+                Iterable value = (Iterable) entry.getValue();
+                System.out.println(entry.getKey() + ":");
+                for (Object o : value) {
+                    System.out.println(o);
+                }
+            } else {
+                System.out.println(entry.getKey() + ":\t" + entry.getValue());
+            }
         }
     }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
index cbce8324..01f8d8b8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java
@@ -13,9 +13,10 @@ import java.util.Map;
 
 /**
  * 持久化到文件的接口。
+ *
  * @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午6:28
+ * Date: 13-4-21
+ * Time: 下午6:28
  */
 public class FilePipeline implements Pipeline {
 
@@ -32,6 +33,7 @@ public class FilePipeline implements Pipeline {
 
     /**
      * 新建一个FilePipeline
+     *
      * @param path 文件保存路径
      */
     public FilePipeline(String path) {
@@ -45,18 +47,31 @@ public class FilePipeline implements Pipeline {
         if (!file.exists()) {
             file.mkdirs();
         }
-        if (resultItems.isSkip()){
+        if (resultItems.isSkip()) {
             return;
         }
         try {
-            PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl())+".html"));
-            printWriter.println("url:\t" + resultItems.getRequest().getUrl());
-            for (Map.Entry entry : resultItems.getAll().entrySet()) {
-                printWriter.println(entry.getKey()+":\t"+entry.getValue());
+            PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
+            try {
+                printWriter.println("url:\t" + resultItems.getRequest().getUrl());
+                for (Map.Entry entry : resultItems.getAll().entrySet()) {
+                    if (entry.getValue() instanceof Iterable) {
+                        // Iterable values are written one element per line beneath the key.
+                        Iterable value = (Iterable) entry.getValue();
+                        printWriter.println(entry.getKey() + ":");
+                        for (Object o : value) {
+                            printWriter.println(o);
+                        }
+                    } else {
+                        printWriter.println(entry.getKey() + ":\t" + entry.getValue());
+                    }
+                }
+            } finally {
+                // Close the writer even if iterating or stringifying a value throws, so the file handle is not leaked.
+                printWriter.close();
             }
-            printWriter.close();
         } catch (IOException e) {
-            logger.warn("write file error",e);
+            logger.warn("write file error", e);
         }
     }
 }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
new file mode 100644
index 00000000..b43c3c56
--- /dev/null
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java
@@ -0,0 +1,49 @@
+package us.codecraft.webmagic.samples;
+
+import org.apache.commons.collections.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.FilePipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.RedisScheduler;
+
+import java.util.List;
+
+/**
+ * Sample processor that follows InfoQ mini-book links and stores each page's PDF links under "pdf".
+ * @author code4crafter@gmail.com
+ * Date: 13-4-21, Time: 8:08 PM
+ */
+public class InfoQMiniBookProcessor implements PageProcessor {
+    private Site site;
+
+    /** Queues further mini-book links, then keeps the page's PDF links or marks the page as skipped. */
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all());
+        List pdfLinks = page.getHtml().links().regex(".*\\.pdf").all();
+        if (CollectionUtils.isEmpty(pdfLinks)) {
+            page.getResultItems().setSkip(true);
+            return;
+        }
+        page.putField("pdf", pdfLinks);
+    }
+
+    /** Builds the site settings on the first call and hands back the same instance afterwards. */
+    @Override
+    public Site getSite() {
+        if (site == null) {
+            site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks")
+                    .addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH")
+                    .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
+        }
+        return site;
+    }
+
+    /** Runs the crawl with a Redis scheduler on localhost, a file pipeline and five threads. */
+    public static void main(String[] args) {
+        Spider.create(new InfoQMiniBookProcessor()).scheduler(new RedisScheduler("localhost"))
+                .pipeline(new FilePipeline("/data/temp/webmagic/")).thread(5).run();
+    }
+}