diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java index 997b6cf1..f13c6ed7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -5,8 +5,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 下午5:29
+ * @since 0.2.0 */ public class AndSelector implements Selector { @@ -18,6 +17,10 @@ public class AndSelector implements Selector { } } + public AndSelector(List selectors) { + this.selectors = selectors; + } + @Override public String select(String text) { for (Selector selector : selectors) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java index 48f9fb93..4ece3222 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -5,8 +5,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 下午5:29
+ * @since 0.2.0 */ public class OrSelector implements Selector { @@ -18,11 +17,15 @@ public class OrSelector implements Selector { } } + public OrSelector(List selectors) { + this.selectors = selectors; + } + @Override public String select(String text) { for (Selector selector : selectors) { text = selector.select(text); - if (text!=null){ + if (text != null) { return text; } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 2f9004b5..043af109 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.selector.*; +import us.codecraft.webmagic.utils.ExtractorUtils; import java.lang.annotation.Annotation; import java.lang.reflect.Field; @@ -49,20 +50,15 @@ class PageModelExtractor { for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field); - FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field); + FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { - throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } - // ExtractBy2 & ExtractBy3 - if (fieldExtractor!=null){ - addAnnotationExtractBy2(fieldExtractor); - addAnnotationExtractBy3(fieldExtractor); - } fieldExtractorTmp = 
getAnnotationExtractByUrl(clazz, field); if (fieldExtractor != null && fieldExtractorTmp != null) { - throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!"); + throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!"); } else if (fieldExtractor == null && fieldExtractorTmp != null) { fieldExtractor = fieldExtractorTmp; } @@ -94,26 +90,23 @@ class PageModelExtractor { return fieldExtractor; } - private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) { FieldExtractor fieldExtractor = null; - ExtractBy extractBy = field.getAnnotation(ExtractBy.class); - if (extractBy != null) { - String value = extractBy.value(); + ComboExtract comboExtract = field.getAnnotation(ComboExtract.class); + if (comboExtract != null) { + ExtractBy[] extractBies = comboExtract.value(); Selector selector; - switch (extractBy.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); + switch (comboExtract.op()) { + case And: + selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); break; - case XPath: - selector = new XpathSelector(value); + case Or: + selector = new OrSelector(ExtractorUtils.getSelectors(extractBies)); break; default: - selector = new XpathSelector(value); + selector = new AndSelector(ExtractorUtils.getSelectors(extractBies)); } - fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); @@ -122,70 +115,12 @@ class PageModelExtractor { return fieldExtractor; } - private void 
addAnnotationExtractBy2(FieldExtractor fieldExtractor) { - ExtractBy2 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy2.class); - if (extractBy != null) { - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - default: - selector = new XpathSelector(value); - } - fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); - } - } - - private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) { - ExtractBy3 extractBy = fieldExtractor.getField().getAnnotation(ExtractBy3.class); - if (extractBy != null) { - String value = extractBy.value(); - Selector selector; - switch (extractBy.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - default: - selector = new XpathSelector(value); - } - fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); - } - } - - private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) { + private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) { FieldExtractor fieldExtractor = null; - ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class); - if (extractByRaw != null) { - String value = extractByRaw.value(); - Selector selector; - switch (extractByRaw.type()) { - case Css: - selector = new CssSelector(value); - break; - case Regex: - selector = new RegexSelector(value); - break; - case XPath: - selector = new XpathSelector(value); - break; - default: - selector = new XpathSelector(value); - } - fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); + ExtractBy extractBy = 
field.getAnnotation(ExtractBy.class); + if (extractBy != null) { + Selector selector = ExtractorUtils.getSelector(extractBy); + fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); if (setterMethod != null) { fieldExtractor.setSetterMethod(setterMethod); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java index 1f5f008c..02fa25b4 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java @@ -5,14 +5,75 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** + * Combo 'ExtractBy' extractor with and/or operator. + * * @author code4crafter@gmail.com
- * Date: 13-8-16
- * Time: 下午11:09
+ * @since 0.2.1 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD, ElementType.TYPE}) public @interface ComboExtract { + /** + * The extractors to be combined. + * + * @return the extractors to be combined + */ + ExtractBy[] value(); + enum Op { + /** + * All extractors will be arranged as a pipeline.
+ * The next extractor uses the result of the previous as source. + */ + And, + /** + * All extractors will do extracting separately,
+ * and the results of extractors will be combined as the final result. + */ + Or; + } + + /** + * Combining operation of extractors.
+ * + * @return combining operation of extractors + */ + Op op() default Op.And; + + /** + * Define whether the field can be null.
+ * If set to 'true' and the extractor gets no result, the entire class will be discarded.
+ * + * @return whether the field can be null + */ + boolean notNull() default false; + + public enum Source { + /** + * extract from the content extracted by class extractor + */ + SelectedHtml, + /** + * extract from the raw html + */ + RawHtml + } + + /** + * The source for extracting.
+ * It works only if you already added 'ExtractBy' to Class.
+ * + * @return the source for extracting + */ + Source source() default Source.SelectedHtml; + + /** + * Define whether the extractor return more than one result. + * When set to 'true', the extractor return a list of string (so you should define the field as List).
+ * + * @return whether the extractor return more than one result + */ + boolean multi() default false; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 8c12ce1f..9e0ea18e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -5,45 +5,63 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义类或者字段的抽取规则。
+ * Define the extractor for a field or class.
* * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
+ * @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD, ElementType.TYPE}) public @interface ExtractBy { /** - * 抽取规则 + * Extractor expression, support XPath, CSS Selector and regex. * - * @return 抽取规则 + * @return extractor expression */ String value(); public enum Type {XPath, Regex, Css} /** - * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath + * Extractor type, support XPath, CSS Selector and regex. * - * @return 抽取规则类型 + * @return extractor type */ Type type() default Type.XPath; /** - * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * Define whether the field can be null.
+ * If set to 'true' and the extractor gets no result, the entire class will be discarded.
* - * @return 是否是不能为空的关键字段 + * @return whether the field can be null */ boolean notNull() default false; + public enum Source { + /** + * extract from the content extracted by class extractor + */ + SelectedHtml, + /** + * extract from the raw html + */ + RawHtml + } + + /** + * The source for extracting.
+ * It works only if you already added 'ExtractBy' to Class.
+ * + * @return the source for extracting + */ + Source source() default Source.SelectedHtml; + /** - * 是否抽取多个结果
- * 用于字段时,需要List来盛放结果
- * 用于类时,表示单页抽取多个对象
+ * Define whether the extractor return more than one result. + * When set to 'true', the extractor return a list of string (so you should define the field as List).
* - * @return 是否抽取多个结果 + * @return whether the extractor return more than one result */ boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java deleted file mode 100644 index 2a4f0802..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java +++ /dev/null @@ -1,24 +0,0 @@ -package us.codecraft.webmagic.model.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
- * - * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
- */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface ExtractBy2 { - - String value(); - - public enum Type {XPath, Regex, Css} - - Type type() default Type.XPath; - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java deleted file mode 100644 index 741682d4..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java +++ /dev/null @@ -1,23 +0,0 @@ -package us.codecraft.webmagic.model.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * 定义类或者字段的抽取规则,只能在Extract、ExtractByRaw之后使用。
- * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
- */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface ExtractBy3 { - - String value(); - - public enum Type { XPath, Regex, Css} - - Type type() default Type.XPath; - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java deleted file mode 100644 index a3ae3e5c..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ /dev/null @@ -1,49 +0,0 @@ -package us.codecraft.webmagic.model.annotation; - -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.Target; - -/** - * 对于在Class级别就使用过ExtractBy的类,在字段中想抽取全部内容可使用此方法。
- * - * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
- */ -@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD, ElementType.TYPE}) -public @interface ExtractByRaw { - - /** - * 抽取规则 - * - * @return 抽取规则 - */ - String value(); - - public enum Type {XPath, Regex, Css} - - /** - * 抽取规则类型,支持XPath、Css selector、正则表达式,默认是XPath - * - * @return 抽取规则类型 - */ - Type type() default Type.XPath; - - /** - * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false - * - * @return 是否是不能为空的关键字段 - */ - boolean notNull() default false; - - /** - * 是否抽取多个结果
- * 需要List来盛放结果
- * - * @return 是否抽取多个结果 - */ - boolean multi() default false; - -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java index 51b5f0df..416bd89f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java @@ -5,35 +5,35 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义类或者字段的抽取规则(从url中抽取,只支持正则表达式)。
+ * Define an extractor for url. Only regex can be used.
+ * * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
+ * @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.FIELD}) -public @interface ExtractByUrl{ +public @interface ExtractByUrl { /** - * 抽取规则,支持正则表达式 + * Extractor expression, only regex can be used * - * @return 抽取规则 + * @return extractor expression */ String value() default ""; /** - * 是否是不能为空的关键字段,若notNull为true,则对应字段抽取不到时,丢弃整个类,默认为false + * Define whether the field can be null.
+ * If set to 'true' and the extractor gets no result, the entire class will be discarded.
* - * @return 是否是不能为空的关键字段 + * @return whether the field can be null */ boolean notNull() default false; /** - * 是否抽取多个结果
- * 用于字段时,需要List来盛放结果
- * 用于类时,表示单页抽取多个对象
+ * Define whether the extractor return more than one result. + * When set to 'true', the extractor return a list of string (so you should define the field as List).
* - * @return 是否抽取多个结果 + * @return whether the extractor return more than one result */ boolean multi() default false; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java index 9a0cce4f..d986c1e1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java @@ -5,26 +5,32 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义辅助爬取的url。
+ * Define the 'help' url patterns for class.
+ * All urls matching the pattern will be crawled but not extracted for new objects.
+ * * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
+ * @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.TYPE}) public @interface HelpUrl { /** - * 某个类对应的URL规则列表
- * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
+ * The url patterns to crawl.
+ * Use regex expression with some changes:
+ * "." stand for literal character "." instead of "any character".
+ * "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length".
* - * @return 抽取规则 + * @return the url patterns for class */ String[] value(); /** - * 指定提取URL的区域(仅支持XPath) - * @return 指定提取URL的区域 + * Define the region for url extracting.
+ * Only support XPath.
+ * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
+ * + * @return the region for url extracting */ String sourceRegion() default ""; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java index e12fca39..3a8ab559 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java @@ -5,27 +5,32 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * 定义某个类抽取的范围和来源,sourceRegion可以用xpath语法限定抽取区域。
+ * Define the url patterns for class.
+ * All urls matching the pattern will be crawled and extracted for new objects.
* * @author code4crafter@gmail.com
- * Date: 13-8-1
- * Time: 下午8:40
+ * @since 0.2.0 */ @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) @Target({ElementType.TYPE}) public @interface TargetUrl { /** - * 某个类对应的URL规则列表
- * webmagic对正则表达式进行了修改,"."仅表示字符"."而不代表任意字符,而"\*"则代表了".\*",例如"http://\*.oschina.net/\*"代表了oschina所有的二级域名下的URL。
+ * The url patterns for class.
+ * Use regex expression with some changes:
+ * "." stand for literal character "." instead of "any character".
+ * "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length".
* - * @return 抽取规则 + * @return the url patterns for class */ String[] value(); /** - * 指定提取URL的区域(仅支持XPath) - * @return 指定提取URL的区域 + * Define the region for url extracting.
+ * Only support XPath.
+ * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
+ * + * @return the region for url extracting */ String sourceRegion() default ""; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html index 1e3004fb..4e213f7d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html @@ -1,5 +1,5 @@ -webmagic注解抓取方式所定义的注解。 +Annotations for define a class. diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java new file mode 100644 index 00000000..5c6ebbf8 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.utils; + +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.selector.CssSelector; +import us.codecraft.webmagic.selector.RegexSelector; +import us.codecraft.webmagic.selector.Selector; +import us.codecraft.webmagic.selector.XpathSelector; + +import java.util.ArrayList; +import java.util.List; + +/** + * Tools for annotation converting.
+ * @author code4crafter@gmail.com
+ * @since 0.2.1 + */ +public class ExtractorUtils { + + public static Selector getSelector(ExtractBy extractBy) { + String value = extractBy.value(); + Selector selector; + switch (extractBy.type()) { + case Css: + selector = new CssSelector(value); + break; + case Regex: + selector = new RegexSelector(value); + break; + case XPath: + selector = new XpathSelector(value); + break; + default: + selector = new XpathSelector(value); + } + return selector; + } + + public static List getSelectors(ExtractBy[] extractBies) { + List selectors = new ArrayList(); + if (extractBies==null){ + return selectors; + } + for (ExtractBy extractBy : extractBies) { + selectors.add(getSelector(extractBy)); + } + return selectors; + } +}