From 145628557d39d162fc801c86f2f237a8d36f08f5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 3 Aug 2013 18:01:17 +0800 Subject: [PATCH] update afterextract api --- .../codecraft/webmagic/oo/AfterExtractor.java | 4 +- .../webmagic/oo/ObjectPageProcessor.java | 28 +++++++++---- .../webmagic/oo/PageModelExtractor.java | 41 ++++++++++++------- .../us/codecraft/webmagic/oo/OschinaBlog.java | 6 +-- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java index cb9788ba..79feaaf3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/AfterExtractor.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.Page; * @date: 13-8-3
* Time: 上午9:42
*/ -public interface AfterExtractor { +public interface AfterExtractor { - public void afterProcess(Page page, T t); + public void afterProcess(Page page); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java index dda96b56..c280acd5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/ObjectPageProcessor.java @@ -4,11 +4,13 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selector; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -33,7 +35,7 @@ public class ObjectPageProcessor implements PageProcessor { } - public ObjectPageProcessor addPageModel(Class clazz){ + public ObjectPageProcessor addPageModel(Class clazz) { PageModelExtractor pageModelExtractor = PageModelExtractor.create(clazz); targetUrlPatterns.addAll(pageModelExtractor.getTargetUrlPatterns()); targetUrlPatterns.addAll(pageModelExtractor.getHelpUrlPatterns()); @@ -49,22 +51,34 @@ public class ObjectPageProcessor implements PageProcessor { public void process(Page page) { for (PageModelExtractor pageModelExtractor : pageModelExtractorList) { Object process = pageModelExtractor.process(page); - if (process==null){ + if (process == null) { page.getResultItems().setSkip(true); } postProcessPageModel(pageModelExtractor.getClazz(), process); page.putField(pageModelExtractor.getClazz().getCanonicalName(), process); + extractLinks(page, pageModelExtractor.getHelpUrlRegionSelector(), pageModelExtractor.getHelpUrlPatterns()); + extractLinks(page, pageModelExtractor.getTargetUrlRegionSelector(), pageModelExtractor.getTargetUrlPatterns()); } - for (String link : page.getHtml().links().all()) { - for (Pattern targetUrlPattern : targetUrlPatterns) { - if (targetUrlPattern.matcher(link).matches()){ - page.addTargetRequest(new Request(link)); + } + + private void extractLinks(Page page, Selector urlRegionSelector, List urlPatterns) { + List links; + if (urlRegionSelector == null) { + links = page.getHtml().links().all(); + } else { + links = urlRegionSelector.selectList(page.getHtml().toString()); + } + for (String link : links) { + for (Pattern targetUrlPattern : urlPatterns) { + Matcher matcher = targetUrlPattern.matcher(link); + if (matcher.find()) { + page.addTargetRequest(new Request(matcher.group(1))); } } } } - protected void postProcessPageModel(Class clazz, Object object){ + protected void postProcessPageModel(Class clazz, Object object) { } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java index 83a4d310..8a0d81b4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/oo/PageModelExtractor.java @@ -21,14 +21,16 @@ class PageModelExtractor { private List targetUrlPatterns = new ArrayList(); + private Selector targetUrlRegionSelector; + private List helpUrlPatterns = new ArrayList(); + private Selector helpUrlRegionSelector; + private Class clazz; private List fieldExtractors; - private AfterExtractor afterExtractor; - public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -39,13 +41,6 @@ class PageModelExtractor { this.clazz = clazz; initTargetUrlPatterns(); fieldExtractors = new ArrayList(); - if (AfterExtractor.class.isAssignableFrom(clazz)) { - try { - afterExtractor = (AfterExtractor) clazz.newInstance(); - } catch (Exception e) { - throw new IllegalArgumentException(e); - } - } for (Field field : clazz.getDeclaredFields()) { field.setAccessible(true); ExtractBy extractBy = field.getAnnotation(ExtractBy.class); @@ -117,16 +112,24 @@ class PageModelExtractor { if (annotation == null) { targetUrlPatterns.add(Pattern.compile(".*")); } else { - String[] value = ((TargetUrl) annotation).value(); + TargetUrl targetUrl = (TargetUrl) annotation; + String[] value = targetUrl.value(); for (String s : value) { - targetUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + targetUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + } + if (!targetUrl.sourceRegion().equals("")){ + targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); if (annotation != null) { - String[] value = ((HelpUrl) annotation).value(); + HelpUrl helpUrl = (HelpUrl) annotation; + String[] value = helpUrl.value(); for (String s : value) { - helpUrlPatterns.add(Pattern.compile(s.replace(".", "\\.").replace("*", "[^\"'#]*"))); + helpUrlPatterns.add(Pattern.compile("("+s.replace(".", "\\.").replace("*", "[^\"'#]*")+")")); + } + if (!helpUrl.sourceRegion().equals("")){ + helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); } } } @@ -179,8 +182,8 @@ class PageModelExtractor { setField(o, fieldExtractor, value); } } - if (afterExtractor != null) { - afterExtractor.afterProcess(page, o); + if (AfterExtractor.class.isAssignableFrom(clazz)) { + ((AfterExtractor)o).afterProcess(page); } } catch (InstantiationException e) { e.printStackTrace(); @@ -210,4 +213,12 @@ class PageModelExtractor { List getHelpUrlPatterns() { return helpUrlPatterns; } + + Selector getTargetUrlRegionSelector() { + return targetUrlRegionSelector; + } + + Selector getHelpUrlRegionSelector() { + return helpUrlRegionSelector; + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 85d4817e..0f64aef8 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -9,8 +9,8 @@ import java.util.List; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlog implements AfterExtractor { +@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']") +public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") private String title; @@ -22,7 +22,7 @@ public class OschinaBlog implements AfterExtractor { private List tags; @Override - public void afterProcess(Page page, OschinaBlog oschinaBlog) { + public void afterProcess(Page page) { content = null; } }