update:PatternHandler

pull/94/head
Tian 11 years ago
parent 843e928c2c
commit 99e12aafaa

@ -6,6 +6,7 @@ import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.handler.SubPageProcessor;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
@ -32,21 +33,23 @@ public class PatternProcessorDemo {
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
@Override
public void onExtract(Page page) {
public SubPageProcessor.MatchOtherProcessor process(Page page) {
log.info("Extracting from " + page.getUrl());
page.putField("test", "hello world:)");
return MatchOtherProcessor.YES;
}
@Override
public void onHandle(ResultItems result, Task task) {
public void handle(ResultItems result, Task task) {
log.info("Handling " + result.getRequest().getUrl());
log.info("Retrieved test=" + result.get("test"));
}
};
handler.register(processor, pipeline);
processor.addHandler(handler);
pipeline.addHandler(handler);
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
}

@ -3,8 +3,6 @@ package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
import java.util.UUID;
@ -17,7 +15,7 @@ import java.util.UUID;
* A PatternHandler is in charge of both page extraction and data processing by implementing
* its two abstract methods.
*/
public abstract class PatternHandler {
public abstract class PatternHandler implements SubPageProcessor {
/**
* identity of the handler.
@ -47,46 +45,25 @@ public abstract class PatternHandler {
return url.matches(pattern);
}
/**
* registers to both the page processor and the pipeline so the handler could take charge of
* both end of procedure.
*
* @param processor
* the processor to handle
* @param pipeline
* the pipeline to handle
*/
public void register(PatternPageProcessor processor, PatternPipeline pipeline) {
processor.addHandler(this);
pipeline.addHandler(this);
}
public void unregister(PatternPageProcessor processor, PatternPipeline pipeline) {
processor.removeHandler(this);
pipeline.removeHandler(this);
}
public boolean process(Page page) {
public boolean processPage(Page page) {
if(match(page.getUrl().toString())) {
page.putField(id, true);
onExtract(page);
process(page);
return true;
} else {
return false;
}
}
public boolean process(ResultItems resultItems, Task task) {
public boolean processResult(ResultItems resultItems, Task task) {
if(resultItems.isSkip()) {
return false;
}
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
onHandle(resultItems, task);
handle(resultItems, task);
return true;
} else {
return false;
@ -94,20 +71,20 @@ public abstract class PatternHandler {
}
/**
* implements this method to extract from page.
*
* @param page
* the page to extract
*/
public abstract void onExtract(Page page);
/**
* implements this method to handle the extraction result.
* Override this method to handle the extraction result. This method MUST be used
* with PatternPipeline.
*
* @param result
* extraction result
* @param task
*/
public abstract void onHandle(ResultItems result, Task task);
public void handle(ResultItems result, Task task) {
    // Intentionally empty: the default implementation does nothing with the result.
}
/**
 * Tells whether this handler applies to the given page.
 * Delegates to the String-based {@code match} overload, passing the page's URL.
 *
 * @param page the page whose URL is tested
 * @return the result of matching the page's URL
 */
@Override
public boolean match(Page page) {
    final String pageUrl = page.getUrl().toString();
    return match(pageUrl);
}
}

@ -8,26 +8,27 @@ import us.codecraft.webmagic.Page;
*/
public interface SubPageProcessor {
/**
* Check whether the SubPageProcessor can process the page.<br></br>
* Please DO NOT change page status in this method.
*
* @param page
* @return
*/
public boolean match(Page page);
/**
* Check whether the SubPageProcessor can process the page.<br>
* Please DO NOT change page status in this method.
*
* @param page
*
* @return true if this SubPageProcessor can process the page
*/
public boolean match(Page page);
/**
*
* process the page, extract urls to fetch, extract the data and store
*
* @param page
* @return whether continue to match
*/
public MatchOtherProcessor process(Page page);
/**
* process the page, extract urls to fetch, extract the data and store
*
* @param page
*
* @return whether to continue matching other processors
*/
public MatchOtherProcessor process(Page page);
public enum MatchOtherProcessor {
YES, NO;
}
/**
 * Value returned after a page has been processed, telling the caller
 * whether matching should continue.
 */
public enum MatchOtherProcessor {
    /** Presumably: keep matching other processors (confirm against the caller). */
    YES,
    /** Presumably: stop matching other processors (confirm against the caller). */
    NO
}
}

@ -22,7 +22,6 @@ public class PatternPipeline implements Pipeline {
*
* @param handler the pattern handler
*
* @see PatternHandler#register
*/
public void addHandler(PatternHandler handler) {
@ -38,7 +37,7 @@ public class PatternPipeline implements Pipeline {
public void process(ResultItems resultItems, Task task) {
for(PatternHandler handler : handlers) {
handler.process(resultItems, task);
handler.processResult(resultItems, task);
}
}
}

@ -47,18 +47,16 @@ public class PatternPageProcessor implements PageProcessor {
page.addTargetRequests(requests);
for(PatternHandler handler : handlers) {
if(handler.match(page.getUrl().toString())) {
handler.process(page);
handler.processPage(page);
}
}
}
/**
* A handler works only if it is added to BOTH the page processor and the pipeline.
* Uses PatternHandler's register instead.
*
* @param handler the pattern handler
*
* @see PatternHandler#register
*
*/
public void addHandler(PatternHandler handler) {

Loading…
Cancel
Save