From 6cc1d62a08d34d74697f9df6e4c178767cc1f93a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 19:42:51 +0800 Subject: [PATCH] bugfix: rawhtml do not work --- .../java/us/codecraft/webmagic/downloader/FileCache.java | 2 ++ .../src/main/java/us/codecraft/webmagic/model/OOSpider.java | 6 ++++++ .../us/codecraft/webmagic/model/PageModelExtractor.java | 6 ++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java index a78a343b..163c75ba 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java @@ -4,6 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.*; +import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor; @@ -20,6 +21,7 @@ import java.io.*; * @author code4crafter@gmail.com * @since 0.2.1 */ +@Experimental public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor { private Downloader downloaderWhenFileMiss; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index cbfd50f3..bff5a3a3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -23,6 +23,12 @@ import us.codecraft.webmagic.processor.PageProcessor; * private List tags; * } * </pre> + * And start the spider by: + * <pre>
+ *   OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
+ *        ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
+ * }
+ * </pre>
* @author code4crafter@gmail.com
* @since 0.2.0 */ diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 522d491b..a16c7a1b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -105,7 +105,8 @@ class PageModelExtractor { default: selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi()); + fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + comboExtract.notNull(), comboExtract.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -119,7 +120,8 @@ class PageModelExtractor { ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { Selector selector = ExtractorUtils.getSelector(extractBy); - fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + fieldExtractor = new FieldExtractor(field, selector, extractBy.source() == ExtractBy.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html, + extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod);