refactor SubPageProcessor etc. #94

pull/121/head
yihua.huang 11 years ago
parent acb63d55d7
commit f973889cda

@ -1,56 +0,0 @@
package us.codecraft.webmagic.example;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.handler.SubPageProcessor;
import us.codecraft.webmagic.pipeline.PatternPipeline;
import us.codecraft.webmagic.processor.PatternPageProcessor;
/**
 * Demo that wires a single {@link PatternHandler} into both a
 * {@link PatternPageProcessor} and a {@link PatternPipeline}.
 * <p>
 * The same handler instance is registered on the processor (page extraction)
 * and on the pipeline (result handling).
 * </p>
 * Originally created by Sebastian MA, April 04, 2014.
 */
public class PatternProcessorDemo {

    private static Logger log = Logger.getLogger(PatternProcessorDemo.class);

    /**
     * Builds the processor, pipeline and handler, then starts the spider
     * with {@code runAsync()}.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        PatternPipeline resultPipeline = new PatternPipeline();
        PatternPageProcessor pageProcessor = new PatternPageProcessor(
                "http://item.jd.com/981821.html",
                PatternPageProcessor.TARGET_PATTERN_ALL);

        // One handler instance serves both stages.
        PatternHandler itemHandler = newItemHandler();
        pageProcessor.addHandler(itemHandler);
        resultPipeline.addHandler(itemHandler);

        Spider spider = Spider.create(pageProcessor).thread(5).addPipeline(resultPipeline);
        spider.runAsync();
    }

    /**
     * Creates a handler which handles only "http://item.jd.com/.*".
     *
     * @return a handler that stores a fixed "test" field on extraction and
     *         logs it when the result is handled
     */
    private static PatternHandler newItemHandler() {
        return new PatternHandler("http://item.jd.com/.*") {

            @Override
            public SubPageProcessor.MatchOtherProcessor process(Page page) {
                log.info("Extracting from " + page.getUrl());
                page.putField("test", "hello world:)");
                return SubPageProcessor.MatchOtherProcessor.YES;
            }

            @Override
            public void handle(ResultItems result, Task task) {
                log.info("Handling " + result.getRequest().getUrl());
                log.info("Retrieved test=" + result.get("test"));
            }
        };
    }
}

@ -0,0 +1,66 @@
package us.codecraft.webmagic.example;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.handler.CompositePageProcessor;
import us.codecraft.webmagic.handler.CompositePipeline;
import us.codecraft.webmagic.handler.PatternProcessor;
import us.codecraft.webmagic.handler.RequestMatcher;
/**
 * Example that crawls GitHub with two {@link PatternProcessor}s combined through a
 * {@link CompositePageProcessor} and a {@link CompositePipeline}.
 * <p>
 * Each {@link PatternProcessor} is registered on both the page processor (extraction)
 * and the pipeline (result handling).
 * </p>
 * Originally created by Sebastian MA, April 04, 2014.
 */
public class PatternProcessorExample {

    private static final Logger log = Logger.getLogger(PatternProcessorExample.class);

    /**
     * Wires the repository and user processors together and starts the spider at
     * {@code https://github.com/code4craft} with {@code runAsync()}.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        // Handles repository pages: URLs containing "https://github.com/<owner>/<repo>".
        PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") {

            @Override
            public RequestMatcher.MatchOther processPage(Page page) {
                page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
                return RequestMatcher.MatchOther.YES;
            }

            @Override
            public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
                log.info("Extracting from repo " + resultItems.getRequest());
                System.out.println(resultItems.get("reponame"));
                return RequestMatcher.MatchOther.YES;
            }
        };

        // Handles user pages: URLs containing "https://github.com/<user>".
        // NOTE: PatternRequestMatcher matches with find(), so this pattern also matches
        // repository URLs; since the repo processor returns YES, both processors run on them.
        PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") {

            @Override
            public RequestMatcher.MatchOther processPage(Page page) {
                log.info("Extracting from " + page.getUrl());
                // Queue further repository and user pages found on this page.
                page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
                page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());
                page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString());
                return RequestMatcher.MatchOther.YES;
            }

            @Override
            public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
                System.out.println(resultItems.get("username"));
                return RequestMatcher.MatchOther.YES;
            }
        };

        CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com"));
        CompositePipeline pipeline = new CompositePipeline();
        // Registration order is evaluation order: repository processor first, then user processor.
        pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor);
        pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
        Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
    }
}

@ -15,14 +15,18 @@ public class CompositePageProcessor implements PageProcessor {
private Site site;
private List<SubPageProcessor> subPageProcessors;
private List<SubPageProcessor> subPageProcessors = new ArrayList<SubPageProcessor>();
public CompositePageProcessor(Site site) {
this.site = site;
}
@Override
public void process(Page page) {
for (SubPageProcessor subPageProcessor : subPageProcessors) {
if (subPageProcessor.match(page)) {
SubPageProcessor.MatchOtherProcessor matchOtherProcessorProcessor = subPageProcessor.process(page);
if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) {
if (subPageProcessor.match(page.getRequest())) {
SubPageProcessor.MatchOther matchOtherProcessorProcessor = subPageProcessor.processPage(page);
if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOther.YES) {
return;
}
}
@ -34,6 +38,11 @@ public class CompositePageProcessor implements PageProcessor {
return this;
}
public CompositePageProcessor addSubPageProcessor(SubPageProcessor subPageProcessor) {
this.subPageProcessors.add(subPageProcessor);
return this;
}
public CompositePageProcessor setSubPageProcessors(SubPageProcessor... subPageProcessors) {
this.subPageProcessors = new ArrayList<SubPageProcessor>();
for (SubPageProcessor subPageProcessor : subPageProcessors) {

@ -0,0 +1,42 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.ArrayList;
import java.util.List;
/**
 * A {@link Pipeline} that delegates each result to an ordered list of
 * {@link SubPipeline}s.
 * <p>
 * Sub pipelines are tried in registration order. A sub pipeline whose
 * {@code match} rejects the request is skipped; one that accepts it processes
 * the result, and delegation continues only while it answers
 * {@link RequestMatcher.MatchOther#YES}.
 * </p>
 *
 * @author code4crafter@gmail.com
 */
public class CompositePipeline implements Pipeline {

    private List<SubPipeline> subPipelines = new ArrayList<SubPipeline>();

    /**
     * Offers the result to every registered sub pipeline in order, stopping
     * after the first matching one that does not return {@code YES}.
     *
     * @param resultItems the extracted result; its request is used for matching
     * @param task        the task passed through to each sub pipeline
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        for (SubPipeline candidate : subPipelines) {
            if (!candidate.match(resultItems.getRequest())) {
                continue;
            }
            RequestMatcher.MatchOther verdict = candidate.processResult(resultItems, task);
            // A null verdict is also "not YES", so it ends delegation as well.
            if (verdict != RequestMatcher.MatchOther.YES) {
                return;
            }
        }
    }

    /**
     * Appends a sub pipeline to the end of the delegation list.
     *
     * @param subPipeline the sub pipeline to register
     * @return this pipeline, for chaining
     */
    public CompositePipeline addSubPipeline(SubPipeline subPipeline) {
        subPipelines.add(subPipeline);
        return this;
    }

    /**
     * Replaces all registered sub pipelines with the given ones, in order.
     *
     * @param subPipelines the new sub pipelines
     * @return this pipeline, for chaining
     */
    public CompositePipeline setSubPipeline(SubPipeline... subPipelines) {
        this.subPipelines = new ArrayList<SubPipeline>();
        for (int i = 0; i < subPipelines.length; i++) {
            this.subPipelines.add(subPipelines[i]);
        }
        return this;
    }
}

@ -1,90 +0,0 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.UUID;
/**
* Created with IntelliJ IDEA.
* User: Sebastian MA
* Date: April 03, 2014
* Time: 10:00
* <p></p>
* A PatternHandler is in charge of both page extraction and data processing by implementing
* its two abstract methods.
*/
public abstract class PatternHandler implements SubPageProcessor {
/**
* identity of the handler.
*/
protected String id;
/**
* match pattern. only matched page should be handled.
*/
protected String pattern;
/**
* @param pattern
* url pattern to handle
*/
protected PatternHandler(String pattern) {
this.pattern = pattern;
this.id = UUID.randomUUID().toString();
}
/**
* determine if the page should be handled.
*/
public boolean match(String url) {
return url.matches(pattern);
}
public boolean processPage(Page page) {
if(match(page.getUrl().toString())) {
page.putField(id, true);
process(page);
return true;
} else {
return false;
}
}
public boolean processResult(ResultItems resultItems, Task task) {
if(resultItems.isSkip()) {
return false;
}
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
handle(resultItems, task);
return true;
} else {
return false;
}
}
/**
* override this method to handle the extraction result. this method MUST use
* with PatternPipeline
*
* @param result
* extraction result
* @param task
*/
public void handle(ResultItems result, Task task) {
}
@Override
public boolean match(Page page) {
return match(page.getUrl().toString());
}
}

@ -0,0 +1,13 @@
package us.codecraft.webmagic.handler;
/**
 * Convenience base class for a component that is both a {@link SubPageProcessor}
 * and a {@link SubPipeline}, with request matching inherited from
 * {@link PatternRequestMatcher}.
 * <p>
 * Subclasses supply {@code processPage} and {@code processResult}.
 * </p>
 *
 * @author code4crafter@gmail.com
 */
public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor {

    /**
     * Creates a processor that applies to requests matched by the given pattern.
     *
     * @param pattern url pattern to handle
     */
    public PatternProcessor(String pattern) {
        super(pattern);
    }
}

@ -0,0 +1,37 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Request;
import java.util.regex.Pattern;
/**
 * Base {@link RequestMatcher} that selects requests by testing their URL against a
 * regular expression.
 * <p>
 * The expression is compiled once in the constructor and applied with
 * {@link java.util.regex.Matcher#find()}, so a request matches when the pattern occurs
 * anywhere in its URL. Anchor the expression with {@code ^} and {@code $} to require
 * the whole URL to match.
 * </p>
 * Originally created by Sebastian MA, April 03, 2014.
 */
public abstract class PatternRequestMatcher implements RequestMatcher {

    /**
     * Source text of the match pattern. Only matched requests should be handled.
     * <p>
     * Matching uses the form compiled at construction time, so reassigning this
     * field afterwards does not change the outcome of {@link #match(Request)}.
     * </p>
     */
    protected String pattern;

    /** Compiled form of the pattern, fixed at construction. */
    private final Pattern patternCompiled;

    /**
     * @param pattern url pattern to handle, as a regular expression
     * @throws java.util.regex.PatternSyntaxException if the expression is not valid
     */
    public PatternRequestMatcher(String pattern) {
        this.pattern = pattern;
        this.patternCompiled = Pattern.compile(pattern);
    }

    /**
     * Tests whether the pattern occurs anywhere in the request URL.
     *
     * @param request the request whose URL is tested
     * @return {@code true} if the compiled pattern is found in the URL
     */
    @Override
    public boolean match(Request request) {
        return patternCompiled.matcher(request.getUrl()).find();
    }
}

@ -0,0 +1,24 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.Request;
/**
 * Decides whether a component applies to a given {@link Request}.
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public interface RequestMatcher {

    /**
     * Check whether to process the request.<br>
     * Please DO NOT change any status in this method.
     *
     * @param page the request to test
     * @return {@code true} if the request should be processed by this matcher
     */
    boolean match(Request page);

    /**
     * Answer returned by a processing step to say whether the remaining matchers
     * should still be tried for the same request.
     */
    enum MatchOther {
        YES, NO
    }
}

@ -6,17 +6,7 @@ import us.codecraft.webmagic.Page;
* @author code4crafter@gmail.com
* @date 14-4-5
*/
public interface SubPageProcessor {
/**
* Check whether the SubPageProcessor can process the page.<br></br>
* Please DO NOT change page status in this method.
*
* @param page
*
* @return
*/
public boolean match(Page page);
public interface SubPageProcessor extends RequestMatcher {
/**
* process the page, extract urls to fetch, extract the data and store
@ -25,10 +15,6 @@ public interface SubPageProcessor {
*
* @return whether continue to match
*/
public MatchOtherProcessor process(Page page);
public enum MatchOtherProcessor {
YES, NO
}
public MatchOther processPage(Page page);
}

@ -0,0 +1,21 @@
package us.codecraft.webmagic.handler;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
 * A pipeline stage that is applied only to results whose request it matches
 * (see {@link RequestMatcher#match}).
 *
 * @author code4crafter@gmail.com
 * @since 0.5.0
 */
public interface SubPipeline extends RequestMatcher {

    /**
     * Process the extracted result.
     *
     * @param resultItems the extracted result to process
     * @param task        the task the result belongs to
     * @return whether continue to match
     */
    MatchOther processResult(ResultItems resultItems, Task task);
}

@ -1,43 +0,0 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.handler.PatternHandler;
import java.util.ArrayList;
/**
 * A {@link Pipeline} that forwards every result to all registered
 * {@link PatternHandler}s; each handler decides for itself whether to act on it.
 * <p>
 * Originally created by Sebastian MA, April 04, 2014.
 * </p>
 */
public class PatternPipeline implements Pipeline {

    /** Registered handlers, in registration order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * Registers a handler with this pipeline.
     * A handler works only if it is added to BOTH the page processor and the pipeline.
     *
     * @param handler the pattern handler
     */
    public void addHandler(PatternHandler handler) {
        this.handlers.add(handler);
    }

    /**
     * Unregisters a previously added handler.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        this.handlers.remove(handler);
    }

    /**
     * Passes the result to each registered handler in turn. The handlers' boolean
     * answers are ignored, so every handler sees every result.
     *
     * @param resultItems the extracted result
     * @param task        the task the result belongs to
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        for (PatternHandler registered : handlers) {
            registered.processResult(resultItems, task);
        }
    }
}

@ -1,76 +0,0 @@
package us.codecraft.webmagic.processor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.handler.PatternHandler;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * A PatternPageProcessor uses PatternHandler to set up extraction rules for specific
 * url patterns.
 * <p>
 * On every page it first queues all links matching the target pattern, then lets each
 * registered handler whose pattern matches the page URL process the page.
 * </p>
 * Originally created by Sebastian MA, April 04, 2014.
 *
 * @see us.codecraft.webmagic.handler.PatternHandler
 */
public class PatternPageProcessor implements PageProcessor {

    /** Wildcard target pattern; {@code *} is expanded by the constructor. */
    public static final String TARGET_PATTERN_ALL = "http://*";

    protected Site site;

    /** Regular expression used to select the links to follow. */
    protected String targetPattern;

    /** Registered handlers, in registration order. */
    protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();

    /**
     * Creates a processor starting at {@code startUrl}.
     * <p>
     * The wildcard pattern is converted to a regular expression: every {@code .} is
     * escaped, every {@code *} becomes {@code [^"'#]*}, and the result is wrapped in a
     * capturing group.
     * </p>
     *
     * @param startUrl      the start URL; also used to derive the site domain
     * @param targetPattern wildcard pattern of the links to follow
     */
    public PatternPageProcessor(String startUrl, String targetPattern) {
        Site crawlSite = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));
        String escaped = targetPattern.replace(".", "\\.").replace("*", "[^\"'#]*");
        crawlSite.setUserAgent("Chrome/5.0.354.0");
        this.site = crawlSite;
        this.targetPattern = "(" + escaped + ")";
    }

    /**
     * Queues the links matching the target pattern, then runs every handler that
     * matches the page URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex(targetPattern).all());
        for (PatternHandler registered : handlers) {
            if (!registered.match(page.getUrl().toString())) {
                continue;
            }
            registered.processPage(page);
        }
    }

    /**
     * Registers a handler with this processor.
     *
     * @param handler the pattern handler
     */
    public void addHandler(PatternHandler handler) {
        this.handlers.add(handler);
    }

    /**
     * Unregisters a previously added handler.
     *
     * @param handler the pattern handler to remove
     */
    public void removeHandler(PatternHandler handler) {
        this.handlers.remove(handler);
    }

    /**
     * @return the site configuration built in the constructor
     */
    @Override
    public Site getSite() {
        return this.site;
    }
}
Loading…
Cancel
Save