add combo extract to replace Extract2 Extract3...

12 years ago · 3ba7a76f44
parent f946fcdfea
commit 3ba7a76f44
13 changed files with 213 additions and 230 deletions
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java
@ -5,8 +5,7 @@ import java.util.List;

 /**
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-3 <br>
- * Time: 下午5:29 <br>
+ * @since 0.2.0
 */
 public class AndSelector implements Selector {

@ -18,6 +17,10 @@ public class AndSelector implements Selector {
        }
    }

+    public AndSelector(List<Selector> selectors) {
+        this.selectors = selectors;
+    }
+
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java
@ -5,8 +5,7 @@ import java.util.List;

 /**
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-3 <br>
- * Time: 下午5:29 <br>
+ * @since 0.2.0
 */
 public class OrSelector implements Selector {

@ -18,11 +17,15 @@ public class OrSelector implements Selector {
        }
    }

+    public OrSelector(List<Selector> selectors) {
+        this.selectors = selectors;
+    }
+
    @Override
    public String select(String text) {
        for (Selector selector : selectors) {
            text = selector.select(text);
-            if (text!=null){
+            if (text != null) {
                return text;
            }
        }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils;
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.model.annotation.*;
 import us.codecraft.webmagic.selector.*;
+import us.codecraft.webmagic.utils.ExtractorUtils;

 import java.lang.annotation.Annotation;
 import java.lang.reflect.Field;
@ -49,20 +50,15 @@ class PageModelExtractor {
        for (Field field : clazz.getDeclaredFields()) {
            field.setAccessible(true);
            FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
-            FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
+            FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
-                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+                throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
-            // ExtractBy2 & ExtractBy3
-            if (fieldExtractor!=null){
-                addAnnotationExtractBy2(fieldExtractor);
-                addAnnotationExtractBy3(fieldExtractor);
-            }
            fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
            if (fieldExtractor != null && fieldExtractorTmp != null) {
-                throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
+                throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
            } else if (fieldExtractor == null && fieldExtractorTmp != null) {
                fieldExtractor = fieldExtractorTmp;
            }
@ -94,26 +90,23 @@ class PageModelExtractor {
        return fieldExtractor;
    }

-    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
+    private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
-        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
-        if (extractBy != null) {
-            String value = extractBy.value();
+        ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
+        if (comboExtract != null) {
+            ExtractBy[] extractBies = comboExtract.value();
            Selector selector;
-            switch (extractBy.type()) {
-                case Css:
-                    selector = new CssSelector(value);
-                    break;
-                case Regex:
-                    selector = new RegexSelector(value);
+            switch (comboExtract.op()) {
+                case And:
+                    selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
                    break;
-                case XPath:
-                    selector = new XpathSelector(value);
+                case Or:
+                    selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
                    break;
                default:
-                    selector = new XpathSelector(value);
+                    selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
            }
-            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
+            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi());
            Method setterMethod = getSetterMethod(clazz, field);
            if (setterMethod != null) {
                fieldExtractor.setSetterMethod(setterMethod);
@ -122,70 +115,12 @@ class PageModelExtractor {
        return fieldExtractor;
    }

-    private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
-        ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
-        if (extractBy != null) {
-            String value = extractBy.value();
-            Selector selector;
-            switch (extractBy.type()) {
-                case Css:
-                    selector = new CssSelector(value);
-                    break;
-                case Regex:
-                    selector = new RegexSelector(value);
-                    break;
-                case XPath:
-                    selector = new XpathSelector(value);
-                    break;
-                default:
-                    selector = new XpathSelector(value);
-            }
-            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
-        }
-    }
-
-    private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
-        ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
-        if (extractBy != null) {
-            String value = extractBy.value();
-            Selector selector;
-            switch (extractBy.type()) {
-                case Css:
-                    selector = new CssSelector(value);
-                    break;
-                case Regex:
-                    selector = new RegexSelector(value);
-                    break;
-                case XPath:
-                    selector = new XpathSelector(value);
-                    break;
-                default:
-                    selector = new XpathSelector(value);
-            }
-            fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
-        }
-    }
-
-    private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
+    private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
        FieldExtractor fieldExtractor = null;
-        ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
-        if (extractByRaw != null) {
-            String value = extractByRaw.value();
-            Selector selector;
-            switch (extractByRaw.type()) {
-                case Css:
-                    selector = new CssSelector(value);
-                    break;
-                case Regex:
-                    selector = new RegexSelector(value);
-                    break;
-                case XPath:
-                    selector = new XpathSelector(value);
-                    break;
-                default:
-                    selector = new XpathSelector(value);
-            }
-            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
+        ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
+        if (extractBy != null) {
+            Selector selector = ExtractorUtils.getSelector(extractBy);
+            fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
            Method setterMethod = getSetterMethod(clazz, field);
            if (setterMethod != null) {
                fieldExtractor.setSetterMethod(setterMethod);
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java
@ -5,14 +5,75 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;

 /**
+ * Combo 'ExtractBy' extractor with and/or operator.
+ *
 * @author code4crafter@gmail.com <br>
- *         Date: 13-8-16 <br>
- *         Time: 下午11:09 <br>
+ * @since 0.2.1
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
 public @interface ComboExtract {

+    /**
+     * The extractors to be combined.
+     *
+     * @return the extractors to be combined
+     */
+    ExtractBy[] value();

+    enum Op {
+        /**
+         * All extractors will be arranged as a pipeline. <br>
+         * The next extractor uses the result of the previous as source.
+         */
+        And,
+        /**
+         * All extractors will do extracting separately, <br>
+         * and the results of extractors will be combined as the final result.
+         */
+        Or;
+    }
+
+    /**
+     * Combining operation of extractors.<br>
+     *
+     * @return combining operation of extractors
+     */
+    Op op() default Op.And;
+
+    /**
+     * Define whether the field must be non-null.<br>
+     * If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
+     *
+     * @return whether the field must be non-null
+     */
+    boolean notNull() default false;
+
+    public enum Source {
+        /**
+         * extract from the content extracted by class extractor
+         */
+        SelectedHtml,
+        /**
+         * extract from the raw html
+         */
+        RawHtml
+    }
+
+    /**
+     * The source for extracting. <br>
+     * It works only if you already added 'ExtractBy' to Class. <br>
+     *
+     * @return the source for extracting
+     */
+    Source source() default Source.SelectedHtml;
+
+    /**
+     * Define whether the extractor returns more than one result.
+     * When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
+     *
+     * @return whether the extractor returns more than one result
+     */
+    boolean multi() default false;

 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@ -5,45 +5,63 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;

 /**
- * 定义类或者字段的抽取规则。<br>
+ * Define the extractor for a field or class.<br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
+ * @since 0.2.0
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
 public @interface ExtractBy {

    /**
-     * 抽取规则
+     * Extractor expression; supports XPath, CSS selector and regex.
     *
-     * @return 抽取规则
+     * @return extractor expression
     */
    String value();

    public enum Type {XPath, Regex, Css}

    /**
-     * 抽取规则类型，支持XPath、Css selector、正则表达式，默认是XPath
+     * Extractor type; supports XPath, CSS selector and regex. Defaults to XPath.
     *
-     * @return 抽取规则类型
+     * @return extractor type
     */
    Type type() default Type.XPath;

    /**
-     * 是否是不能为空的关键字段，若notNull为true，则对应字段抽取不到时，丢弃整个类，默认为false
+     * Define whether the field must be non-null.<br>
+     * If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
     *
-     * @return 是否是不能为空的关键字段
+     * @return whether the field must be non-null
     */
    boolean notNull() default false;

+    public enum Source {
+        /**
+         * extract from the content extracted by class extractor
+         */
+        SelectedHtml,
+        /**
+         * extract from the raw html
+         */
+        RawHtml
+    }
+
+    /**
+     * The source for extracting. <br>
+     * It works only if you already added 'ExtractBy' to Class. <br>
+     *
+     * @return the source for extracting
+     */
+    Source source() default Source.SelectedHtml;
+
    /**
-     * 是否抽取多个结果<br>
-     * 用于字段时，需要List<String>来盛放结果<br>
-     * 用于类时，表示单页抽取多个对象<br>
+     * Define whether the extractor returns more than one result.
+     * When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
     *
-     * @return 是否抽取多个结果
+     * @return whether the extractor returns more than one result
     */
    boolean multi() default false;

--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
@ -1,24 +0,0 @@
-package us.codecraft.webmagic.model.annotation;
-
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.Target;
-
-/**
- * 定义类或者字段的抽取规则，只能在Extract、ExtractByRaw之后使用。<br>
- *
- * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
- */
-@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
-@Target({ElementType.FIELD})
-public @interface ExtractBy2 {
-
-    String value();
-
-    public enum Type {XPath, Regex, Css}
-
-    Type type() default Type.XPath;
-
-}
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
@ -1,23 +0,0 @@
-package us.codecraft.webmagic.model.annotation;
-
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.Target;
-
-/**
- * 定义类或者字段的抽取规则，只能在Extract、ExtractByRaw之后使用。<br>
- * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
- */
-@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
-@Target({ElementType.FIELD})
-public @interface ExtractBy3 {
-
-    String value();
-
-    public enum Type { XPath, Regex, Css}
-
-    Type type() default Type.XPath;
-
-}
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
@ -1,49 +0,0 @@
-package us.codecraft.webmagic.model.annotation;
-
-import java.lang.annotation.ElementType;
-import java.lang.annotation.Retention;
-import java.lang.annotation.Target;
-
-/**
- * 对于在Class级别就使用过ExtractBy的类，在字段中想抽取全部内容可使用此方法。<br>
- *
- * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
- */
-@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
-@Target({ElementType.FIELD, ElementType.TYPE})
-public @interface ExtractByRaw {
-
-    /**
-     * 抽取规则
-     *
-     * @return 抽取规则
-     */
-    String value();
-
-    public enum Type {XPath, Regex, Css}
-
-    /**
-     * 抽取规则类型，支持XPath、Css selector、正则表达式，默认是XPath
-     *
-     * @return 抽取规则类型
-     */
-    Type type() default Type.XPath;
-
-    /**
-     * 是否是不能为空的关键字段，若notNull为true，则对应字段抽取不到时，丢弃整个类，默认为false
-     *
-     * @return 是否是不能为空的关键字段
-     */
-    boolean notNull() default false;
-
-    /**
-     * 是否抽取多个结果<br>
-     * 需要List<String>来盛放结果<br>
-     *
-     * @return 是否抽取多个结果
-     */
-    boolean multi() default false;
-
-}
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java
@ -5,35 +5,35 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;

 /**
- * 定义类或者字段的抽取规则(从url中抽取，只支持正则表达式)。<br>
+ * Define an extractor for the url. Only regex can be used. <br>
+ *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
+ * @since 0.2.0
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
-public @interface ExtractByUrl{
+public @interface ExtractByUrl {

    /**
-     * 抽取规则，支持正则表达式
+     * Extractor expression, only regex can be used
     *
-     * @return 抽取规则
+     * @return extractor expression
     */
    String value() default "";

    /**
-     * 是否是不能为空的关键字段，若notNull为true，则对应字段抽取不到时，丢弃整个类，默认为false
+     * Define whether the field must be non-null.<br>
+     * If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
     *
-     * @return 是否是不能为空的关键字段
+     * @return whether the field must be non-null
     */
    boolean notNull() default false;

    /**
-     * 是否抽取多个结果<br>
-     * 用于字段时，需要List<String>来盛放结果<br>
-     * 用于类时，表示单页抽取多个对象<br>
+     * Define whether the extractor returns more than one result.
+     * When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
     *
-     * @return 是否抽取多个结果
+     * @return whether the extractor returns more than one result
     */
    boolean multi() default false;

--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java
@ -5,26 +5,32 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;

 /**
- * 定义辅助爬取的url。<br>
+ * Define the 'help' url patterns for class. <br>
+ * All urls matching the pattern will be crawled but not extracted for new objects. <br>
+ *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
+ * @since 0.2.0
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
 public @interface HelpUrl {

    /**
-     * 某个类对应的URL规则列表<br>
-     * webmagic对正则表达式进行了修改，"."仅表示字符"."而不代表任意字符，而"\*"则代表了".\*"，例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
+     * The url patterns to crawl. <br>
+     * Use regex expression with some changes: <br>
+     *      "." stands for the literal character "." instead of "any character". <br>
+     *      "*" stands for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
     *
-     * @return 抽取规则
+     * @return the url patterns for class
     */
    String[] value();

    /**
-     * 指定提取URL的区域(仅支持XPath)
-     * @return 指定提取URL的区域
+     * Define the region for url extracting. <br>
+     * Only support XPath.<br>
+     * When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
+     *
+     * @return the region for url extracting
     */
    String sourceRegion() default "";
 }
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java
@ -5,27 +5,32 @@ import java.lang.annotation.Retention;
 import java.lang.annotation.Target;

 /**
- * 定义某个类抽取的范围和来源，sourceRegion可以用xpath语法限定抽取区域。<br>
+ * Define the url patterns for class. <br>
+ * All urls matching the pattern will be crawled and extracted for new objects. <br>
 *
 * @author code4crafter@gmail.com <br>
- * Date: 13-8-1 <br>
- * Time: 下午8:40 <br>
+ * @since 0.2.0
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
 public @interface TargetUrl {

    /**
-     * 某个类对应的URL规则列表<br>
-     * webmagic对正则表达式进行了修改，"."仅表示字符"."而不代表任意字符，而"\*"则代表了".\*"，例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。<br>
+     * The url patterns for class.<br>
+     * Use regex expression with some changes: <br>
+     *      "." stands for the literal character "." instead of "any character". <br>
+     *      "*" stands for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
     *
-     * @return 抽取规则
+     * @return the url patterns for class
     */
    String[] value();

    /**
-     * 指定提取URL的区域(仅支持XPath)
-     * @return 指定提取URL的区域
+     * Define the region for url extracting. <br>
+     * Only support XPath.<br>
+     * When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
+     *
+     * @return the region for url extracting
     */
    String sourceRegion() default "";

--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html
@ -1,5 +1,5 @@
 <html>
 	<body>
-webmagic注解抓取方式所定义的注解。
+Annotations for defining a class in webmagic's annotation-based crawling mode.
 	</body>
 </html>
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
@ -0,0 +1,48 @@
+package us.codecraft.webmagic.utils;
+
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.selector.CssSelector;
+import us.codecraft.webmagic.selector.RegexSelector;
+import us.codecraft.webmagic.selector.Selector;
+import us.codecraft.webmagic.selector.XpathSelector;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Tools for converting 'ExtractBy' annotations to selectors. <br>
+ * @author code4crafter@gmail.com <br>
+ * @since 0.2.1
+ */
+public class ExtractorUtils {
+
+    public static Selector getSelector(ExtractBy extractBy) {
+        String value = extractBy.value();
+        Selector selector;
+        switch (extractBy.type()) {
+            case Css:
+                selector = new CssSelector(value);
+                break;
+            case Regex:
+                selector = new RegexSelector(value);
+                break;
+            case XPath:
+                selector = new XpathSelector(value);
+                break;
+            default:
+                selector = new XpathSelector(value);
+        }
+        return selector;
+    }
+
+    public static List<Selector> getSelectors(ExtractBy[] extractBies) {
+        List<Selector> selectors = new ArrayList<Selector>();
+        if (extractBies==null){
+            return selectors;
+        }
+        for (ExtractBy extractBy : extractBies) {
+            selectors.add(getSelector(extractBy));
+        }
+        return selectors;
+    }
+}