add combo extract to replace Extract2 Extract3...

pull/17/head
yihua.huang 12 years ago
parent f946fcdfea
commit 3ba7a76f44

@ -5,8 +5,7 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 5:29 <br>
* @since 0.2.0
*/
public class AndSelector implements Selector {
@ -18,6 +17,10 @@ public class AndSelector implements Selector {
}
}
public AndSelector(List<Selector> selectors) {
this.selectors = selectors;
}
@Override
public String select(String text) {
for (Selector selector : selectors) {

@ -5,8 +5,7 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 5:29 <br>
* @since 0.2.0
*/
public class OrSelector implements Selector {
@ -18,11 +17,15 @@ public class OrSelector implements Selector {
}
}
public OrSelector(List<Selector> selectors) {
this.selectors = selectors;
}
@Override
public String select(String text) {
for (Selector selector : selectors) {
text = selector.select(text);
if (text!=null){
if (text != null) {
return text;
}
}

@ -4,6 +4,7 @@ import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ExtractorUtils;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
@ -49,20 +50,15 @@ class PageModelExtractor {
for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true);
FieldExtractor fieldExtractor = getAnnotationExtractBy(clazz, field);
FieldExtractor fieldExtractorTmp = getAnnotationExtractByRaw(clazz, field);
FieldExtractor fieldExtractorTmp = getAnnotationExtractCombo(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
fieldExtractor = fieldExtractorTmp;
}
// ExtractBy2 & ExtractBy3
if (fieldExtractor!=null){
addAnnotationExtractBy2(fieldExtractor);
addAnnotationExtractBy3(fieldExtractor);
}
fieldExtractorTmp = getAnnotationExtractByUrl(clazz, field);
if (fieldExtractor != null && fieldExtractorTmp != null) {
throw new IllegalStateException("Only one of 'ExtractBy ExtractByRaw ExtractByUrl' can be added to a field!");
throw new IllegalStateException("Only one of 'ExtractBy ComboExtract ExtractByUrl' can be added to a field!");
} else if (fieldExtractor == null && fieldExtractorTmp != null) {
fieldExtractor = fieldExtractorTmp;
}
@ -94,26 +90,23 @@ class PageModelExtractor {
return fieldExtractor;
}
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
FieldExtractor fieldExtractor = null;
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
String value = extractBy.value();
ComboExtract comboExtract = field.getAnnotation(ComboExtract.class);
if (comboExtract != null) {
ExtractBy[] extractBies = comboExtract.value();
Selector selector;
switch (extractBy.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
switch (comboExtract.op()) {
case And:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
break;
case XPath:
selector = new XpathSelector(value);
case Or:
selector = new OrSelector(ExtractorUtils.getSelectors(extractBies));
break;
default:
selector = new XpathSelector(value);
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, comboExtract.notNull(), comboExtract.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
@ -122,70 +115,12 @@ class PageModelExtractor {
return fieldExtractor;
}
/**
 * Looks for an {@code ExtractBy2} annotation on the extractor's field and, when one is present,
 * replaces the extractor's selector with an {@code AndSelector} of the current selector
 * followed by the selector built from the annotation.
 *
 * @param fieldExtractor the extractor whose selector may be extended
 */
private void addAnnotationExtractBy2(FieldExtractor fieldExtractor) {
    ExtractBy2 second = fieldExtractor.getField().getAnnotation(ExtractBy2.class);
    if (second == null) {
        // Nothing to chain; leave the extractor untouched.
        return;
    }
    String expression = second.value();
    Selector chained;
    switch (second.type()) {
        case Css:
            chained = new CssSelector(expression);
            break;
        case Regex:
            chained = new RegexSelector(expression);
            break;
        default:
            // XPath, and anything else, is treated as an XPath expression.
            chained = new XpathSelector(expression);
            break;
    }
    fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), chained));
}
/**
 * Looks for an {@code ExtractBy3} annotation on the extractor's field and, when one is present,
 * replaces the extractor's selector with an {@code AndSelector} of the current selector
 * followed by the selector built from the annotation.
 *
 * @param fieldExtractor the extractor whose selector may be extended
 */
private void addAnnotationExtractBy3(FieldExtractor fieldExtractor) {
    ExtractBy3 third = fieldExtractor.getField().getAnnotation(ExtractBy3.class);
    if (third == null) {
        // Nothing to chain; leave the extractor untouched.
        return;
    }
    String expression = third.value();
    Selector chained;
    switch (third.type()) {
        case Css:
            chained = new CssSelector(expression);
            break;
        case Regex:
            chained = new RegexSelector(expression);
            break;
        default:
            // XPath, and anything else, is treated as an XPath expression.
            chained = new XpathSelector(expression);
            break;
    }
    fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), chained));
}
private FieldExtractor getAnnotationExtractByRaw(Class clazz, Field field) {
private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
FieldExtractor fieldExtractor = null;
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
if (extractByRaw != null) {
String value = extractByRaw.value();
Selector selector;
switch (extractByRaw.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
break;
default:
selector = new XpathSelector(value);
}
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);

@ -5,14 +5,75 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Combines several 'ExtractBy' extractors with an and/or operator.
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-16 <br>
* Time: 11:09 <br>
* @since 0.2.1
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ComboExtract {
/**
* The extractors to be combined.
*
* @return the extractors to be combined
*/
ExtractBy[] value();
/**
* The operations available for combining the extractors.
*/
enum Op {
/**
* All extractors will be arranged as a pipeline. <br>
* Each extractor uses the result of the previous one as its source.
*/
And,
/**
* All extractors will do their extracting separately, <br>
* and their results will be combined into the final result.
*/
Or;
}
/**
* Combining operation of the extractors; {@link Op#And} by default.<br>
*
* @return combining operation of extractors
*/
Op op() default Op.And;
/**
* Define whether the field must not be null.<br>
* If set to 'true' and the extractor gets no result, the entire class will be discarded. <br>
*
* @return whether the field must not be null
*/
boolean notNull() default false;
/**
* The sources an extractor can read from.
*/
public enum Source {
/**
* extract from the content extracted by class extractor
*/
SelectedHtml,
/**
* extract from the raw html
*/
RawHtml
}
/**
* The source for extracting; {@link Source#SelectedHtml} by default. <br>
* It works only if you have already added 'ExtractBy' to the class. <br>
*
* @return the source for extracting
*/
Source source() default Source.SelectedHtml;
/**
* Define whether the extractor returns more than one result.
* When set to 'true', the extractor returns a list of strings (so you should define the field as List). <br>
*
* @return whether the extractor returns more than one result
*/
boolean multi() default false;
}

@ -5,45 +5,63 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* <br>
* Define the extractor for field or class<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
* @since 0.2.0
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractBy {
/**
*
* Extractor expression, support XPath, CSS Selector and regex.
*
* @return
* @return extractor expression
*/
String value();
public enum Type {XPath, Regex, Css}
/**
* XPathCss selectorXPath
* Extractor type, support XPath, CSS Selector and regex.
*
* @return
* @return extractor type
*/
Type type() default Type.XPath;
/**
* notNulltruefalse
* Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
*
* @return
* @return whether the field can be null
*/
boolean notNull() default false;
public enum Source {
/**
* extract from the content extracted by class extractor
*/
SelectedHtml,
/**
* extract from the raw html
*/
RawHtml
}
/**
* The source for extracting. <br>
* It works only if you already added 'ExtractBy' to Class. <br>
*
* @return the source for extracting
*/
Source source() default Source.SelectedHtml;
/**
* <br>
* List<String><br>
* <br>
* Define whether the extractor return more than one result.
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
*
* @return
* @return whether the extractor return more than one result
*/
boolean multi() default false;

@ -1,24 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Secondary extractor for a field, meant to be used together with ExtractBy or ExtractByRaw.<br>
* PageModelExtractor chains its selector after the field's existing selector with an AndSelector.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy2 {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Supported kinds of extractor expression.
*/
public enum Type {XPath, Regex, Css}
/**
* Extractor type; XPath by default.
*
* @return extractor type
*/
Type type() default Type.XPath;
}

@ -1,23 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Third extractor for a field, meant to be used together with ExtractBy or ExtractByRaw.<br>
* PageModelExtractor chains its selector after the field's existing selector with an AndSelector.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractBy3 {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Supported kinds of extractor expression.
*/
public enum Type { XPath, Regex, Css}
/**
* Extractor type; XPath by default.
*
* @return extractor type
*/
Type type() default Type.XPath;
}

@ -1,49 +0,0 @@
package us.codecraft.webmagic.model.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Define an extractor that works on the raw html
* (PageModelExtractor builds it with FieldExtractor.Source.RawHtml).<br>
* NOTE(review): the original comment was garbled; it appears to say this is for classes
* that already use ExtractBy at class level - confirm.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD, ElementType.TYPE})
public @interface ExtractByRaw {
/**
* Extractor expression, interpreted according to {@link #type()}.
*
* @return extractor expression
*/
String value();
/**
* Supported kinds of extractor expression.
*/
public enum Type {XPath, Regex, Css}
/**
* Extractor type: XPath, CSS selector or regex. XPath by default.
*
* @return extractor type
*/
Type type() default Type.XPath;
/**
* Whether the field must not be null; false by default.
*
* @return whether the field must not be null
*/
boolean notNull() default false;
/**
* Whether the extractor returns more than one result; false by default.<br>
* NOTE(review): when 'true' the field is presumably expected to be a {@code List<String>} - confirm.<br>
*
* @return whether the extractor returns more than one result
*/
boolean multi() default false;
}

@ -5,35 +5,35 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* (url)<br>
* Define an extractor for url. Only regex can be used. <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
* @since 0.2.0
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractByUrl{
public @interface ExtractByUrl {
/**
*
* Extractor expression, only regex can be used
*
* @return
* @return extractor expression
*/
String value() default "";
/**
* notNulltruefalse
* Define whether the field can be null.<br>
* If set to 'true' and the extractor get no result, the entire class will be discarded. <br>
*
* @return
* @return whether the field can be null
*/
boolean notNull() default false;
/**
* <br>
* List<String><br>
* <br>
* Define whether the extractor return more than one result.
* When set to 'true', the extractor return a list of string (so you should define the field as List). <br>
*
* @return
* @return whether the extractor return more than one result
*/
boolean multi() default false;

@ -5,26 +5,32 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* Define the 'help' url patterns for class. <br>
* All urls matching the pattern will be crawled but not extracted for new objects. <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
* @since 0.2.0
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface HelpUrl {
/**
* The url patterns to crawl. <br>
* Use regex expressions with some changes: <br>
* "." stands for the literal character "." instead of "any character". <br>
* "*" stands for any legal url character, 0-n times ([^"'#]*), instead of "any length". <br>
*
* @return the url patterns for class
*/
String[] value();
/**
* Define the region for url extracting. <br>
* Only XPath is supported.<br>
* When sourceRegion is set, the urls will be extracted only from the region instead of the entire content. <br>
*
* @return the region for url extracting
*/
String sourceRegion() default "";
}

@ -5,27 +5,32 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
* sourceRegionxpath<br>
* Define the url patterns for class. <br>
* All urls matching the pattern will be crawled and extracted for new objects. <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-8-1 <br>
* Time: 8:40 <br>
* @since 0.2.0
*/
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE})
public @interface TargetUrl {
/**
* URL<br>
* webmagic"."".""\*"".\*""http://\*.oschina.net/\*"oschinaURL<br>
* The url patterns for class.<br>
* Use regex expression with some changes: <br>
* "." stand for literal character "." instead of "any character". <br>
* "*" stand for any legal character for url in 0-n length ([^"'#]*) instead of "any length". <br>
*
* @return
* @return the url patterns for class
*/
String[] value();
/**
* URL(XPath)
* @return URL
* Define the region for url extracting. <br>
* Only support XPath.<br>
* When sourceRegion is set, the urls will be extracted only from the region instead of entire content. <br>
*
* @return the region for url extracting
*/
String sourceRegion() default "";

@ -1,5 +1,5 @@
<html>
<body>
webmagic注解抓取方式所定义的注解。
Annotations for defining a class.
</body>
</html>

@ -0,0 +1,48 @@
package us.codecraft.webmagic.utils;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.selector.CssSelector;
import us.codecraft.webmagic.selector.RegexSelector;
import us.codecraft.webmagic.selector.Selector;
import us.codecraft.webmagic.selector.XpathSelector;
import java.util.ArrayList;
import java.util.List;
/**
 * Helpers that turn {@link ExtractBy} annotations into {@link Selector} instances. <br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public class ExtractorUtils {

    /**
     * Builds the selector described by a single annotation.
     *
     * @param extractBy the annotation supplying the expression and its type
     * @return a CSS, regex or XPath selector for the annotation's expression;
     *         XPath is used for any type that is not Css or Regex
     */
    public static Selector getSelector(ExtractBy extractBy) {
        final String expression = extractBy.value();
        switch (extractBy.type()) {
            case Css:
                return new CssSelector(expression);
            case Regex:
                return new RegexSelector(expression);
            default:
                // XPath is both an explicit type and the fallback.
                return new XpathSelector(expression);
        }
    }

    /**
     * Builds one selector per annotation, keeping the array order.
     *
     * @param extractBies the annotations to convert, may be null
     * @return a new mutable list of selectors; empty when the array is null or empty
     */
    public static List<Selector> getSelectors(ExtractBy[] extractBies) {
        List<Selector> converted = new ArrayList<Selector>();
        if (extractBies != null) {
            for (int i = 0; i < extractBies.length; i++) {
                converted.add(getSelector(extractBies[i]));
            }
        }
        return converted;
    }
}
Loading…
Cancel
Save