diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
index 51a9484e..e2303a07 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java
@@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
+import us.codecraft.webmagic.handler.SubPageProcessor;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
@@ -32,21 +33,23 @@ public class PatternProcessorDemo {
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
@Override
- public void onExtract(Page page) {
+ public SubPageProcessor.MatchOtherProcessor process(Page page) {
log.info("Extracting from " + page.getUrl());
page.putField("test", "hello world:)");
+ return MatchOtherProcessor.YES;
}
@Override
- public void onHandle(ResultItems result, Task task) {
+ public void handle(ResultItems result, Task task) {
log.info("Handling " + result.getRequest().getUrl());
log.info("Retrieved test=" + result.get("test"));
}
};
- handler.register(processor, pipeline);
+ processor.addHandler(handler);
+ pipeline.addHandler(handler);
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
index 51e44e06..4be03de1 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java
@@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.pipeline.PatternPipeline;
-import us.codecraft.webmagic.processor.PatternPageProcessor;
import java.util.UUID;
@@ -17,7 +15,7 @@ import java.util.UUID;
* A PatternHandler is in charge of both page extraction and data processing by implementing
* its two abstract methods.
*/
-public abstract class PatternHandler {
+public abstract class PatternHandler implements SubPageProcessor {
/**
* identity of the handler.
@@ -47,46 +45,25 @@ public abstract class PatternHandler {
return url.matches(pattern);
}
- /**
- * registers to both the page processor and the pipeline so the handler could take charge of
- * both end of procedure.
- *
- * @param processor
- * the processor to handle
- * @param pipeline
- * the pipeline to handle
- */
- public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
-
- processor.addHandler(this);
- pipeline.addHandler(this);
- }
-
- public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
-
- processor.removeHandler(this);
- pipeline.removeHandler(this);
- }
-
- public boolean process(Page page) {
+ public boolean processPage(Page page) {
if(match(page.getUrl().toString())) {
page.putField(id, true);
- onExtract(page);
+ process(page);
return true;
} else {
return false;
}
}
- public boolean process(ResultItems resultItems, Task task) {
+ public boolean processResult(ResultItems resultItems, Task task) {
if(resultItems.isSkip()) {
return false;
}
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
- onHandle(resultItems, task);
+ handle(resultItems, task);
return true;
} else {
return false;
@@ -94,20 +71,20 @@ public abstract class PatternHandler {
}
/**
- * implements this method to extract from page.
- *
- * @param page
- * the page to extract
- */
- public abstract void onExtract(Page page);
-
- /**
- * implements this method to handle the extraction result.
+ * Override this method to handle the extraction result. It is invoked by PatternPipeline,
+ * so the handler MUST also be added to a PatternPipeline for this method to be called.
*
* @param result
* extraction result
* @param task
*/
- public abstract void onHandle(ResultItems result, Task task);
+ public void handle(ResultItems result, Task task) {
+ }
+
+ @Override
+ public boolean match(Page page) {
+
+ return match(page.getUrl().toString());
+ }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
index c8805006..3778a620 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java
@@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page;
*/
public interface SubPageProcessor {
- /**
- * Check whether the SubPageProcessor can process the page.
- * Please DO NOT change page status in this method.
- *
- * @param page
- * @return
- */
- public boolean match(Page page);
+ /**
+ * Check whether the SubPageProcessor can process the page.
+ * Please DO NOT change page status in this method.
+ *
+ * @param page the page to check
+ *
+ * @return true if this SubPageProcessor can process the page
+ */
+ public boolean match(Page page);
- /**
- *
- * process the page, extract urls to fetch, extract the data and store
- *
- * @param page
- * @return whether continue to match
- */
- public MatchOtherProcessor process(Page page);
+ /**
+ * Process the page: extract the URLs to fetch, then extract and store the data.
+ *
+ * @param page the page to process
+ *
+ * @return whether to continue matching other processors
+ */
+ public MatchOtherProcessor process(Page page);
- public enum MatchOtherProcessor {
- YES, NO;
- }
+ public enum MatchOtherProcessor {
+ YES, NO
+ }
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
index 582b1627..c614114d 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java
@@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline {
*
* @param handler the pattern handler
*
- * @see PatternHandler#register
*/
public void addHandler(PatternHandler handler) {
@@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline {
public void process(ResultItems resultItems, Task task) {
for(PatternHandler handler : handlers) {
- handler.process(resultItems, task);
+ handler.processResult(resultItems, task);
}
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java
index d7d909c7..51dbabe3 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java
@@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor {
page.addTargetRequests(requests);
for(PatternHandler handler : handlers) {
if(handler.match(page.getUrl().toString())) {
- handler.process(page);
+ handler.processPage(page);
}
}
}
/**
- * A handler works only if it is added to BOTH the page processor and the pipeline.
- * Uses PatternHandler's register instead.
*
* @param handler the pattern handler
*
- * @see PatternHandler#register
+ * @see us.codecraft.webmagic.pipeline.PatternPipeline#addHandler
*/
public void addHandler(PatternHandler handler) {