diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java index 00ff7fb7..71bdc937 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java @@ -10,16 +10,17 @@ import java.lang.annotation.Target; * Time: 下午8:40
*/ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) +@Target({ElementType.FIELD,ElementType.TYPE}) public @interface ExtractBy { - - //TODO: add list support String value(); - public enum Type {XPath, Regex, Css}; + public enum Type {XPath2, XPath, Regex, Css} - Type type() default Type.XPath; + Type type() default Type.XPath2; boolean notNull() default true; + + boolean multi() default false; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java index 715112ca..e86f08f1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java @@ -17,4 +17,6 @@ public @interface ExtractByUrl{ boolean notNull() default true; + boolean multi() default false; + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java new file mode 100644 index 00000000..f0607cfd --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java @@ -0,0 +1,40 @@ +package us.codecraft.webmagic.oo; + +import us.codecraft.webmagic.selector.Selector; + +/** + * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 下午9:48
+ */ +class Extractor { + + protected final Selector selector; + + protected final Source source; + + protected final boolean notNull; + + protected final boolean multi; + + static enum Source {Html, Url} + + public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { + this.selector = selector; + this.source = source; + this.notNull = notNull; + this.multi = multi; + } + + Selector getSelector() { + return selector; + } + + Source getSource() { + return source; + } + + boolean isNotNull() { + return notNull; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java index 26c1ec6e..2a6bcf72 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java @@ -10,25 +10,15 @@ import java.lang.reflect.Method; * @date: 13-8-1
* Time: 下午9:48
*/ -class FieldExtractor { +class FieldExtractor extends Extractor{ private final Field field; - private final Selector selector; - - private final Source source; - private Method setterMethod; - private final boolean notNull; - - static enum Source {Html, Url} - - public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) { + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) { + super(selector, source, notNull,multi); this.field = field; - this.selector = selector; - this.source = source; - this.notNull = notNull; } Field getField() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java index d41ee9f8..5f523ed3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.pipeline.Pipeline; /** * @author code4crafter@gmail.com
@@ -50,8 +49,4 @@ public class OOSpider extends Spider { return this; } - public Spider pipeline(Pipeline pipeline) { - throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline"); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index dc1ef82f..e743e06a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo; import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.lang.annotation.Annotation; import java.lang.reflect.Field; @@ -42,20 +39,22 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); - if (clazz.isAssignableFrom(AfterExtractor.class)){ + if (AfterExtractor.class.isAssignableFrom(clazz)) { try { - afterExtractor=(AfterExtractor)clazz.newInstance(); + afterExtractor = (AfterExtractor) clazz.newInstance(); } catch (Exception e) { throw new IllegalArgumentException(e); } } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); - if (!field.getType().isAssignableFrom(String.class)){ - throw new IllegalStateException("Field "+field.getName()+" must be string"); - } ExtractBy extractBy = field.getAnnotation(ExtractBy.class); if (extractBy != null) { + if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) { 
+ throw new IllegalStateException("Field " + field.getName() + " must be list"); + } String value = extractBy.value(); Selector selector; switch (extractBy.type()) { @@ -68,10 +67,13 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; + case XPath2: + selector = new Xpath2Selector(value); + break; default: - selector = new XpathSelector(value); + selector = new Xpath2Selector(value); } - FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull()); + FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -80,11 +82,16 @@ class PageModelExtractor { } ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); if (extractByUrl != null) { + if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be string"); + } else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } String regexPattern = extractByUrl.value(); if (regexPattern.trim().equals("")) { regexPattern = ".*"; } - FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull()); + FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -138,24 +145,42 @@ class PageModelExtractor { try { o = clazz.newInstance(); for (FieldExtractor fieldExtractor : fieldExtractors) { - String value; - switch 
(fieldExtractor.getSource()) { - case Html: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); - break; - case Url: - value = fieldExtractor.getSelector().select(page.getUrl().toString()); - break; - default: - value = fieldExtractor.getSelector().select(page.getHtml().toString()); - } - if (value==null&&fieldExtractor.isNotNull()){ - page.getResultItems().setSkip(true); + if (fieldExtractor.multi) { + List value; + switch (fieldExtractor.getSource()) { + case Html: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + break; + case Url: + value = fieldExtractor.getSelector().selectList(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().selectList(page.getHtml().toString()); + } + if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); + } else { + String value; + switch (fieldExtractor.getSource()) { + case Html: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + break; + case Url: + value = fieldExtractor.getSelector().select(page.getUrl().toString()); + break; + default: + value = fieldExtractor.getSelector().select(page.getHtml().toString()); + } + if (value == null && fieldExtractor.isNotNull()) { + page.getResultItems().setSkip(true); + } + setField(o, fieldExtractor, value); } - setField(o, fieldExtractor, value); } - if (afterExtractor!=null){ - afterExtractor.afterProcess(page,o); + if (afterExtractor != null) { + afterExtractor.afterProcess(page, o); } } catch (InstantiationException e) { e.printStackTrace(); @@ -167,7 +192,7 @@ class PageModelExtractor { return o; } - private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException { + private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { if 
(fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 937eba17..b8c7e4a8 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,5 +1,7 @@ package us.codecraft.webmagic.oo; +import java.util.List; + /** * @author yihua.huang@dianping.com
* @date: 13-8-1
@@ -11,7 +13,10 @@ public class OschinaBlog { @ExtractBy("//title") private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) private String content; + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java index 56f5a9ae..e8e3799a 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.oo; -import org.junit.Ignore; import org.junit.Test; import us.codecraft.webmagic.Site; @@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site; */ public class TestFetcher { - @Ignore("takes long") +// @Ignore("takes long") @Test public void test() { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)