diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java new file mode 100644 index 00000000..9991b7f7 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ConsolePageModelPipeline.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.annotation; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import us.codecraft.webmagic.Task; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-3
+ * Time: 3:41 PM
+ */ +public class ConsolePageModelPipeline implements PageModelPipeline { + @Override + public void process(Object o, Task task) { + System.out.println(ToStringBuilder.reflectionToString(o)); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java index c6ae2f31..e5007455 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/OOSpider.java @@ -2,28 +2,57 @@ package us.codecraft.webmagic.annotation; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.pipeline.Pipeline; /** * @author code4crafter@gmail.com
* @date: 13-8-3
* Time: 上午9:51
*/ -public class OOSpider extends Spider{ +public class OOSpider extends Spider { /** - * 使用已定义的抽取规则新建一个Spider。 + * An OOSpider can only be created from an ObjectPageProcessor. * * @param pageProcessor 已定义的抽取规则 */ - public OOSpider(PageProcessor pageProcessor) { - super(pageProcessor); + + private ObjectPageProcessor objectPageProcessor; + + private ObjectPipeline objectPipeline; + + protected OOSpider(ObjectPageProcessor objectPageProcessor) { + super(objectPageProcessor); + this.objectPageProcessor = objectPageProcessor; + } + + public OOSpider(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + this(ObjectPageProcessor.create(site, pageModels)); + this.objectPipeline = new ObjectPipeline(); + super.pipeline(objectPipeline); + for (Class pageModel : pageModels) { + this.objectPipeline.put(pageModel, pageModelPipeline); + } + } + + public static OOSpider create(Site site, Class... pageModels) { + return new OOSpider(site, new ConsolePageModelPipeline(), pageModels); + } + + public static OOSpider create(Site site, PageModelPipeline pageModelPipeline, Class... pageModels) { + return new OOSpider(site, pageModelPipeline, pageModels); + } + + public OOSpider addPageModel(PageModelPipeline pageModelPipeline, Class... pageModels) { + for (Class pageModel : pageModels) { + objectPageProcessor.addPageModel(pageModel); + objectPipeline.put(pageModel, pageModelPipeline); + } + return this; } - public static OOSpider create(Site site,Class... 
pageModels) { - OOSpider ooSpider = new OOSpider(ObjectPageProcessor.create(site, pageModels)); - ooSpider.pipeline(new ObjectPipeline()); - return ooSpider; + public Spider pipeline(Pipeline pipeline) { + throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline"); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java index 063dc818..f3758688 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/ObjectPageProcessor.java @@ -18,30 +18,31 @@ import java.util.regex.Pattern; */ public class ObjectPageProcessor implements PageProcessor { - private List pageModelExtractorList; + private List pageModelExtractorList = new ArrayList(); private Site site; - private Set targetUrlPatterns; + private Set targetUrlPatterns = new HashSet(); public static ObjectPageProcessor create(Site site, Class... 
clazzs) { - List pageModelExtractorList = new ArrayList(); + ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site); for (Class clazz : clazzs) { - PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); - pageModelExtractorList.add(pageModelExtractor); + objectPageProcessor.addPageModel(clazz); } - ObjectPageProcessor objectPageProcessor = new ObjectPageProcessor(site, pageModelExtractorList); return objectPageProcessor; } - private ObjectPageProcessor(Site site, List pageModelExtractorList) { + + public ObjectPageProcessor addPageModel(Class clazz){ + PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); + targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); + targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); + pageModelExtractorList.add(pageModelExtractor); + return this; + } + + private ObjectPageProcessor(Site site) { this.site = site; - this.pageModelExtractorList = pageModelExtractorList; - targetUrlPatterns = new HashSet(); - for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { - targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); - targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); - } } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java index 8ed3b6b8..f91252fa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/annotation/PageModelExtractor.java @@ -22,9 +22,9 @@ import java.util.regex.Pattern; */ class PageModelExtractor { - private List targetUrlPatterns; + private List targetUrlPatterns = new ArrayList(); - private List helpUrlPatterns; + private List helpUrlPatterns = new ArrayList(); private Class clazz; @@ -106,7 +106,6 @@ class PageModelExtractor { } private void 
initTargetUrlPatterns() { - targetUrlPatterns = new ArrayList(); Annotation annotation = clazz.getAnnotation(TargetUrl.class); if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); @@ -116,7 +115,6 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); } } - helpUrlPatterns = new ArrayList(); annotation = clazz.getAnnotation(HelpUrl.class); if (annotation != null) { String[] value = ((HelpUrl) annotation).value(); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java index b29d0533..00264314 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/annotation/TestFetcher.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.annotation; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; @@ -11,13 +10,11 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { - @Ignore("takes long") +// @Ignore("takes long") @Test public void test() { - ObjectPipeline objectPipeline = new ObjectPipeline(); OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) - .pipeline(objectPipeline); - OschinaBlog oschinaBlog = null; + .run(); }