add annotation ExtractByRaw

pull/17/head
yihua.huang 12 years ago
parent 1a50c64e33
commit a5c85c3c8b

@ -0,0 +1,27 @@
package us.codecraft.webmagic.model;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Extraction rule that pairs a selector expression with a selector type.
 * <p>
 * NOTE(review): the original summary line was lost to an encoding problem. The name and the
 * surviving words suggest it is meant for models that already use {@code ExtractBy} at class
 * level and want a field selected from the raw, un-narrowed page HTML instead - confirm
 * against the code that consumes this annotation.
 * <p>
 * Retained at runtime so it can be read reflectively. It may be placed on fields and on types.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 <br>
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD,ElementType.TYPE})
public @interface ExtractByRaw {
/**
 * The selector expression; how it is interpreted depends on {@link #type()}.
 */
String value();
/**
 * Kinds of selector expression that {@link #value()} may hold.
 */
public enum Type {XPath2, XPath, Regex, Css}
/**
 * The kind of selector to build from {@link #value()}. Defaults to {@link Type#XPath2}.
 */
Type type() default Type.XPath2;
/**
 * Defaults to {@code true}.
 * NOTE(review): presumably marks the extracted value as required - the exact effect is
 * decided by the consumer of this annotation; confirm there.
 */
boolean notNull() default true;
/**
 * Defaults to {@code false}.
 * NOTE(review): presumably switches between a single result and a list of results -
 * confirm against the consumer of this annotation.
 */
boolean multi() default false;
}

@ -17,7 +17,7 @@ class Extractor {
protected final boolean multi; protected final boolean multi;
static enum Source {Html, Url} static enum Source {Html, Url, RawHtml}
public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector; this.selector = selector;

@ -46,56 +46,100 @@ class PageModelExtractor {
fieldExtractors = new ArrayList<FieldExtractor>(); fieldExtractors = new ArrayList<FieldExtractor>();
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class); getAnnotationExtractBy(clazz, field);
if (extractBy != null) { getAnnotationExtractByRaw(clazz,field);
if (!extractBy.multi() && !String.class.isAssignableFrom(field.getType())) { getAnnotationExtractByUrl(clazz, field);
throw new IllegalStateException("Field " + field.getName() + " must be string"); }
} else if (extractBy.multi() && !List.class.isAssignableFrom(field.getType())) { }
throw new IllegalStateException("Field " + field.getName() + " must be list");
} private void getAnnotationExtractByUrl(Class clazz, Field field) {
String value = extractBy.value(); ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
Selector selector; if (extractByUrl != null) {
switch (extractBy.type()) { if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
case Css: throw new IllegalStateException("Field " + field.getName() + " must be string");
selector = new CssSelector(value); } else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
break; throw new IllegalStateException("Field " + field.getName() + " must be list");
case Regex: }
selector = new RegexSelector(value); String regexPattern = extractByUrl.value();
break; if (regexPattern.trim().equals("")) {
case XPath: regexPattern = ".*";
selector = new XpathSelector(value);
break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default:
selector = new Xpath2Selector(value);
}
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
fieldExtractors.add(fieldExtractor);
}
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) {
if (!extractByUrl.multi() && !String.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractByUrl.multi() && !List.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
fieldExtractors.add(fieldExtractor);
} }
FieldExtractor fieldExtractor = new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(), extractByUrl.multi());
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
fieldExtractors.add(fieldExtractor);
}
}
/**
 * Registers a field extractor for {@code field} if it is annotated with {@link ExtractBy}.
 * <p>
 * Fields without the annotation are ignored. For annotated fields the declared type is
 * validated first (a {@code List} when {@code multi} is set, a {@code String} otherwise),
 * then a selector matching the annotation's type is built from its value and wrapped in a
 * {@code FieldExtractor} with source {@code Html}. If {@code getSetterMethod} returns a
 * setter it is attached, and the extractor is appended to {@code fieldExtractors}.
 *
 * @param clazz the class declaring {@code field}; passed on to {@code getSetterMethod}
 * @param field the field to inspect
 * @throws IllegalStateException if the field type does not match the annotation's
 *                               {@code multi} setting
 */
private void getAnnotationExtractBy(Class clazz, Field field) {
    final ExtractBy rule = field.getAnnotation(ExtractBy.class);
    if (rule == null) {
        return;
    }

    // Multi-valued rules need a List field; single-valued rules need a String field.
    final Class<?> declaredType = field.getType();
    if (rule.multi()) {
        if (!List.class.isAssignableFrom(declaredType)) {
            throw new IllegalStateException("Field " + field.getName() + " must be list");
        }
    } else if (!String.class.isAssignableFrom(declaredType)) {
        throw new IllegalStateException("Field " + field.getName() + " must be string");
    }

    final String expression = rule.value();
    final Selector selector;
    switch (rule.type()) {
        case Css:
            selector = new CssSelector(expression);
            break;
        case Regex:
            selector = new RegexSelector(expression);
            break;
        case XPath:
            selector = new XpathSelector(expression);
            break;
        case XPath2:
        default:
            // XPath2 is also the fallback for any type not listed above.
            selector = new Xpath2Selector(expression);
            break;
    }

    final FieldExtractor extractor = new FieldExtractor(
            field, selector, FieldExtractor.Source.Html, rule.notNull(), rule.multi());
    final Method setter = getSetterMethod(clazz, field);
    if (setter != null) {
        extractor.setSetterMethod(setter);
    }
    fieldExtractors.add(extractor);
}
/**
 * Registers a field extractor for {@code field} if it is annotated with {@link ExtractByRaw}.
 * <p>
 * Fields without the annotation are ignored. For annotated fields the declared type is
 * validated, a selector is built from the annotation's type and value, and the resulting
 * {@code FieldExtractor} is created with source {@code RawHtml} and appended to
 * {@code fieldExtractors}.
 *
 * @param clazz the class declaring {@code field}; passed on to {@code getSetterMethod}
 * @param field the field to inspect
 * @throws IllegalStateException if the field type does not match the annotation's
 *                               {@code multi} setting
 */
private void getAnnotationExtractByRaw(Class clazz, Field field) {
ExtractByRaw extractByRaw = field.getAnnotation(ExtractByRaw.class);
if (extractByRaw != null) {
// Single-valued rules need a String field; multi-valued rules need a List field.
if (!extractByRaw.multi() && !String.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (extractByRaw.multi() && !List.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
String value = extractByRaw.value();
Selector selector;
// Pick the selector implementation matching the declared expression type.
switch (extractByRaw.type()) {
case Css:
selector = new CssSelector(value);
break;
case Regex:
selector = new RegexSelector(value);
break;
case XPath:
selector = new XpathSelector(value);
break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default:
// Any type not listed above falls back to XPath2.
selector = new Xpath2Selector(value);
}
// RawHtml is the only difference from the ExtractBy variant, which uses Source.Html.
FieldExtractor fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
// Attach a setter only when getSetterMethod returns one.
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
fieldExtractor.setSetterMethod(setterMethod);
}
fieldExtractors.add(fieldExtractor);
} }
} }
@ -181,6 +225,9 @@ class PageModelExtractor {
if (fieldExtractor.multi) { if (fieldExtractor.multi) {
List<String> value; List<String> value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case RawHtml:
value = fieldExtractor.getSelector().selectList(page.getHtml().toString());
break;
case Html: case Html:
value = fieldExtractor.getSelector().selectList(html); value = fieldExtractor.getSelector().selectList(html);
break; break;
@ -197,6 +244,9 @@ class PageModelExtractor {
} else { } else {
String value; String value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
case RawHtml:
value = fieldExtractor.getSelector().select(page.getHtml().toString());
break;
case Html: case Html:
value = fieldExtractor.getSelector().select(html); value = fieldExtractor.getSelector().select(html);
break; break;

@ -22,6 +22,9 @@ public class OschinaBlog implements AfterExtractor {
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags; private List<String> tags;
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> comments;
@Override @Override
public void afterProcess(Page page) { public void afterProcess(Page page) {
System.out.println("title:\t"+title); System.out.println("title:\t"+title);

@ -0,0 +1,13 @@
package us.codecraft.webmagic.model;
/**
 * Page model bound by {@code @TargetUrl} to the OSChina blog URL pattern given in the
 * annotation below.
 * <p>
 * The class currently declares no fields and no extraction rules.
 * NOTE(review): the name suggests it is meant to model blog comments - confirm intent
 * before relying on it.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 10:18 <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class OschinaBlogComment {
}
Loading…
Cancel
Save