diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
index 00ff7fb7..71bdc937 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractBy.java
@@ -10,16 +10,17 @@ import java.lang.annotation.Target;
* Time: 下午8:40
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
-@Target({ElementType.FIELD})
+@Target({ElementType.FIELD,ElementType.TYPE})
public @interface ExtractBy {
-
- //TODO: add list support
String value();
- public enum Type {XPath, Regex, Css};
+ public enum Type {XPath2, XPath, Regex, Css}
- Type type() default Type.XPath;
+ Type type() default Type.XPath2;
boolean notNull() default true;
+
+ boolean multi() default false;
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
index 715112ca..e86f08f1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ExtractByUrl.java
@@ -17,4 +17,6 @@ public @interface ExtractByUrl{
boolean notNull() default true;
+ boolean multi() default false;
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java
new file mode 100644
index 00000000..f0607cfd
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/Extractor.java
@@ -0,0 +1,67 @@
+package us.codecraft.webmagic.oo;
+
+import us.codecraft.webmagic.selector.Selector;
+
+/**
+ * Holds the parts of one extraction rule: the selector to run, the part of
+ * the page it runs against, and the flags controlling how the result is handled.
+ *
+ * @author code4crafter@gmail.com
+ * @date: 13-8-1
+ * Time: 9:48 PM
+ */
+class Extractor {
+
+    /** Selector that pulls the value(s) out of the source text. */
+    protected final Selector selector;
+
+    /** Part of the page the selector is applied to. */
+    protected final Source source;
+
+    /** Whether a missing result marks the page's result items to be skipped. */
+    protected final boolean notNull;
+
+    /** Whether the rule yields a list of values instead of a single string. */
+    protected final boolean multi;
+
+    /** Text an extractor can read from: the page HTML or the page URL. */
+    enum Source {Html, Url}
+
+    /**
+     * @param selector selector applied to the chosen source text
+     * @param source   which part of the page to read
+     * @param notNull  {@code true} if an absent result should cause the page to be skipped
+     * @param multi    {@code true} to extract every match as a list rather than one value
+     */
+    public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
+        this.selector = selector;
+        this.source = source;
+        this.notNull = notNull;
+        this.multi = multi;
+    }
+
+    /** @return the selector for this rule */
+    Selector getSelector() {
+        return selector;
+    }
+
+    /** @return the part of the page this rule reads from */
+    Source getSource() {
+        return source;
+    }
+
+    /** @return the {@code notNull} flag given at construction */
+    boolean isNotNull() {
+        return notNull;
+    }
+
+    /**
+     * Accessor for {@link #multi}, matching the getters of the other fields so
+     * callers do not have to read the field directly.
+     *
+     * @return the {@code multi} flag given at construction
+     */
+    boolean isMulti() {
+        return multi;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
index 26c1ec6e..2a6bcf72 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/FieldExtractor.java
@@ -10,25 +10,15 @@ import java.lang.reflect.Method;
* @date: 13-8-1
* Time: 下午9:48
*/
-class FieldExtractor {
+class FieldExtractor extends Extractor{
private final Field field;
- private final Selector selector;
-
- private final Source source;
-
private Method setterMethod;
- private final boolean notNull;
-
- static enum Source {Html, Url}
-
- public FieldExtractor(Field field, Selector selector, Source source, boolean notNull) {
+ public FieldExtractor(Field field, Selector selector, Source source, boolean notNull,boolean multi) {
+ super(selector, source, notNull,multi);
this.field = field;
- this.selector = selector;
- this.source = source;
- this.notNull = notNull;
}
Field getField() {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
index d41ee9f8..5f523ed3 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/OOSpider.java
@@ -2,7 +2,6 @@ package us.codecraft.webmagic.oo;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
-import us.codecraft.webmagic.pipeline.Pipeline;
/**
* @author code4crafter@gmail.com
@@ -50,8 +49,4 @@ public class OOSpider extends Spider {
return this;
}
- public Spider pipeline(Pipeline pipeline) {
- throw new UnsupportedOperationException("Sorry, OOSpider can only use ObjectPipeline");
- }
-
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
index dc1ef82f..e743e06a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java
@@ -2,10 +2,7 @@ package us.codecraft.webmagic.oo;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.selector.CssSelector;
-import us.codecraft.webmagic.selector.RegexSelector;
-import us.codecraft.webmagic.selector.Selector;
-import us.codecraft.webmagic.selector.XpathSelector;
+import us.codecraft.webmagic.selector.*;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
@@ -42,20 +39,22 @@ class PageModelExtractor {
this.clazz = clazz;
initTargetUrlPatterns();
fieldExtractors = new ArrayList();
- if (clazz.isAssignableFrom(AfterExtractor.class)){
+ if (clazz.isAssignableFrom(AfterExtractor.class)) {
try {
- afterExtractor=(AfterExtractor)clazz.newInstance();
+ afterExtractor = (AfterExtractor) clazz.newInstance();
} catch (Exception e) {
throw new IllegalArgumentException(e);
}
}
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
- if (!field.getType().isAssignableFrom(String.class)){
- throw new IllegalStateException("Field "+field.getName()+" must be string");
- }
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
+ if (!extractBy.multi() && !field.getType().isAssignableFrom(String.class)) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (extractBy.multi() && !field.getType().isAssignableFrom(List.class)) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
+ }
String value = extractBy.value();
Selector selector;
switch (extractBy.type()) {
@@ -68,10 +67,13 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
+ case XPath2:
+ selector = new Xpath2Selector(value);
+ break;
default:
- selector = new XpathSelector(value);
+ selector = new Xpath2Selector(value);
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull());
+ FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@@ -80,11 +82,16 @@ class PageModelExtractor {
}
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) {
+ if (!extractByUrl.multi() && !field.getType().isAssignableFrom(String.class)) {
+ throw new IllegalStateException("Field " + field.getName() + " must be string");
+ } else if (extractByUrl.multi() && !field.getType().isAssignableFrom(List.class)) {
+ throw new IllegalStateException("Field " + field.getName() + " must be list");
+ }
String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
- FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull());
+ FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@@ -138,24 +145,42 @@ class PageModelExtractor {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
- String value;
- switch (fieldExtractor.getSource()) {
- case Html:
- value = fieldExtractor.getSelector().select(page.getHtml().toString());
- break;
- case Url:
- value = fieldExtractor.getSelector().select(page.getUrl().toString());
- break;
- default:
- value = fieldExtractor.getSelector().select(page.getHtml().toString());
- }
- if (value==null&&fieldExtractor.isNotNull()){
- page.getResultItems().setSkip(true);
+ if (fieldExtractor.multi) {
+ List value;
+ switch (fieldExtractor.getSource()) {
+ case Html:
+ value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
+ break;
+ case Url:
+ value = fieldExtractor.getSelector().selectList(page.getUrl().toString());
+ break;
+ default:
+ value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
+ }
+ if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
+ page.getResultItems().setSkip(true);
+ }
+ setField(o, fieldExtractor, value);
+ } else {
+ String value;
+ switch (fieldExtractor.getSource()) {
+ case Html:
+ value = fieldExtractor.getSelector().select(page.getHtml().toString());
+ break;
+ case Url:
+ value = fieldExtractor.getSelector().select(page.getUrl().toString());
+ break;
+ default:
+ value = fieldExtractor.getSelector().select(page.getHtml().toString());
+ }
+ if (value == null && fieldExtractor.isNotNull()) {
+ page.getResultItems().setSkip(true);
+ }
+ setField(o, fieldExtractor, value);
}
- setField(o, fieldExtractor, value);
}
- if (afterExtractor!=null){
- afterExtractor.afterProcess(page,o);
+ if (afterExtractor != null) {
+ afterExtractor.afterProcess(page, o);
}
} catch (InstantiationException e) {
e.printStackTrace();
@@ -167,7 +192,7 @@ class PageModelExtractor {
return o;
}
- private void setField(Object o, FieldExtractor fieldExtractor, String value) throws IllegalAccessException, InvocationTargetException {
+ private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value);
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
index 937eba17..b8c7e4a8 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java
@@ -1,5 +1,7 @@
package us.codecraft.webmagic.oo;
+import java.util.List;
+
/**
* @author yihua.huang@dianping.com
* @date: 13-8-1
@@ -11,7 +13,10 @@ public class OschinaBlog {
@ExtractBy("//title")
private String title;
- @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
+ @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
private String content;
+ @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
+ private List tags;
+
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
index 56f5a9ae..e8e3799a 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/TestFetcher.java
@@ -1,6 +1,5 @@
package us.codecraft.webmagic.oo;
-import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Site;
@@ -11,7 +10,7 @@ import us.codecraft.webmagic.Site;
*/
public class TestFetcher {
- @Ignore("takes long")
+// @Ignore("takes long")
@Test
public void test() {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)