add extract by url

pull/17/head
yihua.huang 12 years ago
parent f08ffc34fd
commit abba3b7bff

@ -0,0 +1,18 @@
package us.codecraft.webmagic.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Marks a field of a page model whose value is taken from the URL of the page
 * instead of from its HTML.
 * <p>
 * As consumed by {@code PageModelExtractor}, {@link #value()} is treated as a
 * regular expression that is applied to the page URL; a blank value is replaced
 * by {@code ".*"}.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-1 <br>
 * Time: 8:40 <br>
 */
@Target(ElementType.FIELD)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface ExtractByUrl {

    /**
     * Regular expression applied to the page URL.
     *
     * @return the pattern; the empty string (the default) when none was given
     */
    String value() default "";

}

@ -15,9 +15,20 @@ class FieldExtractor {
private final Selector selector; private final Selector selector;
FieldExtractor(Field field, Selector selector) { private final Source source;
/** Which text of the page an extractor's selector is applied to: the page HTML or the page URL. */
static enum Source {Html, Url}
/**
 * Creates an extractor that reads from the page HTML.
 * Equivalent to calling the three-argument constructor with {@code Source.Html}.
 *
 * @param field    the model field that receives the extracted value
 * @param selector the selector used to extract the value
 */
public FieldExtractor(Field field, Selector selector) {
    this(field, selector, Source.Html);
}
public FieldExtractor(Field field, Selector selector, Source source) {
this.field = field; this.field = field;
this.selector = selector; this.selector = selector;
this.source = source;
} }
Field getField() { Field getField() {
@ -27,4 +38,8 @@ class FieldExtractor {
Selector getSelector() { Selector getSelector() {
return selector; return selector;
} }
/**
 * Returns which part of the page this extractor reads from.
 *
 * @return the source set at construction ({@code Source.Html} when the
 *         two-argument constructor was used)
 */
Source getSource() {
    return this.source;
}
} }

@ -47,6 +47,7 @@ public class ObjectPageProcessor implements PageProcessor {
public void process(Page page) { public void process(Page page) {
for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) {
Object process = pageModelExtractor.process(page); Object process = pageModelExtractor.process(page);
postProcessPageModel(pageModelExtractor.getClazz(), process);
page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process);
} }
for (String link : page.getHtml().links().all()) { for (String link : page.getHtml().links().all()) {
@ -58,6 +59,9 @@ public class ObjectPageProcessor implements PageProcessor {
} }
} }
/**
 * Extension hook called by {@code process(Page)} once per page model extractor,
 * after the model has been extracted and before it is stored on the page with
 * {@code putField}. The default implementation does nothing.
 *
 * @param clazz  the page model class, as returned by the extractor's {@code getClazz()}
 * @param object the value returned by the extractor's {@code process(page)}
 *               (NOTE(review): possibly {@code null} when extraction fails; confirm
 *               against {@code PageModelExtractor.process})
 */
protected void postProcessPageModel(Class<?> clazz, Object object) {
    // Intentionally empty: subclasses override this to post-process the extracted model.
}
@Override @Override
public Site getSite() { public Site getSite() {
return site; return site;

@ -38,22 +38,32 @@ class PageModelExtractor {
for (Field field : clazz.getDeclaredFields()) { for (Field field : clazz.getDeclaredFields()) {
field.setAccessible(true); field.setAccessible(true);
ExtractBy extractBy = field.getAnnotation(ExtractBy.class); ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
String value = extractBy.value(); if (extractBy != null) {
Selector selector; String value = extractBy.value();
switch (extractBy.type()) { Selector selector;
case Css: switch (extractBy.type()) {
selector = new CssSelector(value); case Css:
break; selector = new CssSelector(value);
case Regex: break;
selector = new RegexSelector(value); case Regex:
break; selector = new RegexSelector(value);
case XPath: break;
selector = new XpathSelector(value); case XPath:
break; selector = new XpathSelector(value);
default: break;
selector = new XpathSelector(value); default:
selector = new XpathSelector(value);
}
fieldExtractors.add(new FieldExtractor(field, selector));
}
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
if (extractByUrl != null) {
String regexPattern = extractByUrl.value();
if (regexPattern.trim().equals("")) {
regexPattern = ".*";
}
fieldExtractors.add(new FieldExtractor(field, new RegexSelector(regexPattern), FieldExtractor.Source.Url));
} }
fieldExtractors.add(new FieldExtractor(field, selector));
} }
} }
@ -65,7 +75,7 @@ class PageModelExtractor {
} else { } else {
String[] value = ((TargetUrl) annotation).value(); String[] value = ((TargetUrl) annotation).value();
for (String s : value) { for (String s : value) {
targetUrlPatterns.add(Pattern.compile(s.replace(".","\\.").replace("*","[^\"'#]*"))); targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*")));
} }
} }
} }
@ -84,7 +94,15 @@ class PageModelExtractor {
try { try {
o = clazz.newInstance(); o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) { for (FieldExtractor fieldExtractor : fieldExtractors) {
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString())); switch (fieldExtractor.getSource()) {
case Html:
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getHtml().toString()));
break;
case Url:
fieldExtractor.getField().set(o, fieldExtractor.getSelector().select(page.getUrl().toString()));
break;
}
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); e.printStackTrace();

@ -6,7 +6,7 @@ package us.codecraft.webmagic.annotation;
* Time: 10:18 <br> * Time: 10:18 <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/*") @TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class Blog { public class OschinaBlog {
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
@ -16,7 +16,7 @@ public class Blog {
@Override @Override
public String toString() { public String toString() {
return "Blog{" + return "OschinaBlog{" +
"title='" + title + '\'' + "title='" + title + '\'' +
", content='" + content + '\'' + ", content='" + content + '\'' +
'}'; '}';

@ -15,7 +15,7 @@ public class TestFetcher {
@Ignore("takes long") @Ignore("takes long")
@Test @Test
public void test() { public void test() {
Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), Blog.class)).run(); Spider.create(ObjectPageProcessor.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class)).run();
} }

@ -1168,7 +1168,7 @@ public class XpathSelectorTest {
+ " var location = window.location;\n" + " var location = window.location;\n"
+ " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n" + " source_url = location.protocol + \"//\" + location.host + location.pathname + location.search;\n"
+ " pre.writeAttribute('codeable_id', post_id);\n" + " pre.writeAttribute('codeable_id', post_id);\n"
+ " pre.writeAttribute('codeable_type', \"Blog\");\n" + " pre.writeAttribute('codeable_type', \"OschinaBlog\");\n"
+ " pre.writeAttribute('source_url', source_url);\n" + " pre.writeAttribute('source_url', source_url);\n"
+ " pre.writeAttribute('pre_index', index);\n" + " pre.writeAttribute('pre_index', index);\n"
+ " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n" + " pre.writeAttribute('title', 'jsoup 解析页面商品信息');\n"

Loading…
Cancel
Save