From 99e12aafaa6906c6e9800fd094e3b60b05f55011 Mon Sep 17 00:00:00 2001 From: Tian Date: Sun, 13 Apr 2014 10:14:39 +0800 Subject: [PATCH] update:PatternHandler --- .../example/PatternProcessorDemo.java | 9 ++-- .../webmagic/handler/PatternHandler.java | 53 ++++++------------- .../webmagic/handler/SubPageProcessor.java | 39 +++++++------- .../webmagic/pipeline/PatternPipeline.java | 3 +- .../processor/PatternPageProcessor.java | 6 +-- 5 files changed, 44 insertions(+), 66 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java index 51a9484e..e2303a07 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.handler.PatternHandler; +import us.codecraft.webmagic.handler.SubPageProcessor; import us.codecraft.webmagic.pipeline.PatternPipeline; import us.codecraft.webmagic.processor.PatternPageProcessor; @@ -32,21 +33,23 @@ public class PatternProcessorDemo { PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { @Override - public void onExtract(Page page) { + public SubPageProcessor.MatchOtherProcessor process(Page page) { log.info("Extracting from " + page.getUrl()); page.putField("test", "hello world:)"); + return MatchOtherProcessor.YES; } @Override - public void onHandle(ResultItems result, Task task) { + public void handle(ResultItems result, Task task) { log.info("Handling " + result.getRequest().getUrl()); log.info("Retrieved test=" + result.get("test")); } }; - handler.register(processor, pipeline); + processor.addHandler(handler); + pipeline.addHandler(handler); Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java index 51e44e06..4be03de1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java @@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.pipeline.PatternPipeline; -import us.codecraft.webmagic.processor.PatternPageProcessor; import java.util.UUID; @@ -17,7 +15,7 @@ import java.util.UUID; * A PatternHandler is in charge of both page extraction and data processing by implementing * its two abstract methods. */ -public abstract class PatternHandler { +public abstract class PatternHandler implements SubPageProcessor { /** * identity of the handler. @@ -47,46 +45,25 @@ public abstract class PatternHandler { return url.matches(pattern); } - /** - * registers to both the page processor and the pipeline so the handler could take charge of - * both end of procedure. - * - * @param processor - * the processor to handle - * @param pipeline - * the pipeline to handle - */ - public void register(PatternPageProcessor processor, PatternPipeline pipeline) { - - processor.addHandler(this); - pipeline.addHandler(this); - } - - public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) { - - processor.removeHandler(this); - pipeline.removeHandler(this); - } - - public boolean process(Page page) { + public boolean processPage(Page page) { if(match(page.getUrl().toString())) { page.putField(id, true); - onExtract(page); + process(page); return true; } else { return false; } } - public boolean process(ResultItems resultItems, Task task) { + public boolean processResult(ResultItems resultItems, Task task) { if(resultItems.isSkip()) { return false; } if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { - onHandle(resultItems, task); + handle(resultItems, task); return true; } else { return false; @@ -94,20 +71,20 @@ public abstract class PatternHandler { } /** - * implements this method to extract from page. - * - * @param page - * the page to extract - */ - public abstract void onExtract(Page page); - - /** - * implements this method to handle the extraction result. + * override this method to handle the extraction result. this method MUST use + * with PatternPipeline * * @param result * extraction result * @param task */ - public abstract void onHandle(ResultItems result, Task task); + public void handle(ResultItems result, Task task) { + } + + @Override + public boolean match(Page page) { + + return match(page.getUrl().toString()); + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java index c8805006..3778a620 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page; */ public interface SubPageProcessor { - /** - * Check whether the SubPageProcessor can process the page.

- * Please DO NOT change page status in this method. - * - * @param page - * @return - */ - public boolean match(Page page); + /** + * Check whether the SubPageProcessor can process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * + * @return + */ + public boolean match(Page page); - /** - * - * process the page, extract urls to fetch, extract the data and store - * - * @param page - * @return whether continue to match - */ - public MatchOtherProcessor process(Page page); + /** + * process the page, extract urls to fetch, extract the data and store + * + * @param page + * + * @return whether continue to match + */ + public MatchOtherProcessor process(Page page); - public enum MatchOtherProcessor { - YES, NO; - } + public enum MatchOtherProcessor { + YES, NO + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java index 582b1627..c614114d 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java @@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline { * * @param handler the pattern handler * - * @see PatternHandler#register */ public void addHandler(PatternHandler handler) { @@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline { public void process(ResultItems resultItems, Task task) { for(PatternHandler handler : handlers) { - handler.process(resultItems, task); + handler.processResult(resultItems, task); } } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java index d7d909c7..51dbabe3 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java @@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor { page.addTargetRequests(requests); for(PatternHandler handler : handlers) { if(handler.match(page.getUrl().toString())) { - handler.process(page); + handler.processPage(page); } } } /** - * A handler works only if it is added to BOTH the page processor and the pipeline. - * Uses PatternHandler's register instead. * * @param handler the pattern handler * - * @see PatternHandler#register + * */ public void addHandler(PatternHandler handler) {