refactor SubPageProcessor etc. #94
parent
acb63d55d7
commit
f973889cda
@ -1,56 +0,0 @@
|
||||
package us.codecraft.webmagic.example;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
import us.codecraft.webmagic.handler.SubPageProcessor;
|
||||
import us.codecraft.webmagic.pipeline.PatternPipeline;
|
||||
import us.codecraft.webmagic.processor.PatternPageProcessor;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 21:23
|
||||
*/
|
||||
public class PatternProcessorDemo {
|
||||
|
||||
private static Logger log = Logger.getLogger(PatternProcessorDemo.class);
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
PatternPageProcessor processor
|
||||
= new PatternPageProcessor("http://item.jd.com/981821.html",
|
||||
PatternPageProcessor.TARGET_PATTERN_ALL
|
||||
);
|
||||
|
||||
PatternPipeline pipeline = new PatternPipeline();
|
||||
|
||||
// define a handler which handles only "http://item.jd.com/.*"
|
||||
PatternHandler handler = new PatternHandler("http://item.jd.com/.*") {
|
||||
|
||||
@Override
|
||||
public SubPageProcessor.MatchOtherProcessor process(Page page) {
|
||||
|
||||
log.info("Extracting from " + page.getUrl());
|
||||
page.putField("test", "hello world:)");
|
||||
return MatchOtherProcessor.YES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handle(ResultItems result, Task task) {
|
||||
|
||||
log.info("Handling " + result.getRequest().getUrl());
|
||||
log.info("Retrieved test=" + result.get("test"));
|
||||
}
|
||||
};
|
||||
|
||||
processor.addHandler(handler);
|
||||
pipeline.addHandler(handler);
|
||||
|
||||
Spider.create(processor).thread(5).addPipeline(pipeline).runAsync();
|
||||
}
|
||||
}
|
@ -0,0 +1,66 @@
|
||||
package us.codecraft.webmagic.example;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import us.codecraft.webmagic.*;
|
||||
import us.codecraft.webmagic.handler.CompositePageProcessor;
|
||||
import us.codecraft.webmagic.handler.CompositePipeline;
|
||||
import us.codecraft.webmagic.handler.PatternProcessor;
|
||||
import us.codecraft.webmagic.handler.RequestMatcher;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 21:23
|
||||
*/
|
||||
public class PatternProcessorExample {
|
||||
|
||||
private static Logger log = Logger.getLogger(PatternProcessorExample.class);
|
||||
|
||||
public static void main(String... args) {
|
||||
|
||||
// define a patternProcessor which handles only "http://item.jd.com/.*"
|
||||
PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") {
|
||||
|
||||
@Override
|
||||
public RequestMatcher.MatchOther processPage(Page page) {
|
||||
page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||
return RequestMatcher.MatchOther.YES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
|
||||
log.info("Extracting from repo" + resultItems.getRequest());
|
||||
System.out.println(resultItems.get("reponame"));
|
||||
return RequestMatcher.MatchOther.YES;
|
||||
}
|
||||
};
|
||||
|
||||
PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") {
|
||||
|
||||
@Override
|
||||
public RequestMatcher.MatchOther processPage(Page page) {
|
||||
log.info("Extracting from " + page.getUrl());
|
||||
page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all());
|
||||
page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all());
|
||||
page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString());
|
||||
return RequestMatcher.MatchOther.YES;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) {
|
||||
System.out.println(resultItems.get("username"));
|
||||
return RequestMatcher.MatchOther.YES;
|
||||
}
|
||||
};
|
||||
|
||||
CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com"));
|
||||
CompositePipeline pipeline = new CompositePipeline();
|
||||
|
||||
pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor);
|
||||
pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor);
|
||||
|
||||
Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class CompositePipeline implements Pipeline {
|
||||
|
||||
private List<SubPipeline> subPipelines = new ArrayList<SubPipeline>();
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
for (SubPipeline subPipeline : subPipelines) {
|
||||
if (subPipeline.match(resultItems.getRequest())) {
|
||||
RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task);
|
||||
if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public CompositePipeline addSubPipeline(SubPipeline subPipeline) {
|
||||
this.subPipelines.add(subPipeline);
|
||||
return this;
|
||||
}
|
||||
|
||||
public CompositePipeline setSubPipeline(SubPipeline... subPipelines) {
|
||||
this.subPipelines = new ArrayList<SubPipeline>();
|
||||
for (SubPipeline subPipeline : subPipelines) {
|
||||
this.subPipelines.add(subPipeline);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
@ -1,90 +0,0 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 03, 2014
|
||||
* Time: 10:00
|
||||
* <p></p>
|
||||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||
* its two abstract methods.
|
||||
*/
|
||||
public abstract class PatternHandler implements SubPageProcessor {
|
||||
|
||||
/**
|
||||
* identity of the handler.
|
||||
*/
|
||||
protected String id;
|
||||
|
||||
/**
|
||||
* match pattern. only matched page should be handled.
|
||||
*/
|
||||
protected String pattern;
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* url pattern to handle
|
||||
*/
|
||||
protected PatternHandler(String pattern) {
|
||||
|
||||
this.pattern = pattern;
|
||||
this.id = UUID.randomUUID().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* determine if the page should be handled.
|
||||
*/
|
||||
public boolean match(String url) {
|
||||
|
||||
return url.matches(pattern);
|
||||
}
|
||||
|
||||
public boolean processPage(Page page) {
|
||||
|
||||
if(match(page.getUrl().toString())) {
|
||||
page.putField(id, true);
|
||||
process(page);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean processResult(ResultItems resultItems, Task task) {
|
||||
|
||||
if(resultItems.isSkip()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) {
|
||||
handle(resultItems, task);
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* override this method to handle the extraction result. this method MUST use
|
||||
* with PatternPipeline
|
||||
*
|
||||
* @param result
|
||||
* extraction result
|
||||
* @param task
|
||||
*/
|
||||
public void handle(ResultItems result, Task task) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(Page page) {
|
||||
|
||||
return match(page.getUrl().toString());
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor {
|
||||
/**
|
||||
* @param pattern url pattern to handle
|
||||
*/
|
||||
public PatternProcessor(String pattern) {
|
||||
super(pattern);
|
||||
}
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 03, 2014
|
||||
* Time: 10:00
|
||||
* <p></p>
|
||||
* A PatternHandler is in charge of both page extraction and data processing by implementing
|
||||
* its two abstract methods.
|
||||
*/
|
||||
public abstract class PatternRequestMatcher implements RequestMatcher {
|
||||
|
||||
/**
|
||||
* match pattern. only matched page should be handled.
|
||||
*/
|
||||
protected String pattern;
|
||||
|
||||
private Pattern patternCompiled;
|
||||
|
||||
/**
|
||||
* @param pattern url pattern to handle
|
||||
*/
|
||||
public PatternRequestMatcher(String pattern) {
|
||||
this.pattern = pattern;
|
||||
this.patternCompiled = Pattern.compile(pattern);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean match(Request request) {
|
||||
return patternCompiled.matcher(request.getUrl()).find();
|
||||
}
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public interface RequestMatcher {
|
||||
|
||||
/**
|
||||
* Check whether to process the page.<br></br>
|
||||
* Please DO NOT change page status in this method.
|
||||
*
|
||||
* @param page
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean match(Request page);
|
||||
|
||||
public enum MatchOther {
|
||||
YES, NO
|
||||
}
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
package us.codecraft.webmagic.handler;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public interface SubPipeline extends RequestMatcher {
|
||||
|
||||
/**
|
||||
* process the page, extract urls to fetch, extract the data and store
|
||||
*
|
||||
* @param page
|
||||
* @param task
|
||||
* @return whether continue to match
|
||||
*/
|
||||
public MatchOther processResult(ResultItems resultItems, Task task);
|
||||
|
||||
}
|
@ -1,43 +0,0 @@
|
||||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 20:44
|
||||
*/
|
||||
public class PatternPipeline implements Pipeline {
|
||||
|
||||
protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
|
||||
|
||||
/**
|
||||
* A handler works only if it is added to BOTH the page processor and the pipeline.
|
||||
* Uses PatternHandler's register instead.
|
||||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
handlers.add(handler);
|
||||
}
|
||||
|
||||
public void removeHandler(PatternHandler handler) {
|
||||
|
||||
handlers.remove(handler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
for(PatternHandler handler : handlers) {
|
||||
handler.processResult(resultItems, task);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.handler.PatternHandler;
|
||||
import us.codecraft.webmagic.utils.UrlUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: Sebastian MA
|
||||
* Date: April 04, 2014
|
||||
* Time: 15:36
|
||||
* <p></p>
|
||||
* A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern.
|
||||
*
|
||||
* @see us.codecraft.webmagic.handler.PatternHandler
|
||||
*/
|
||||
public class PatternPageProcessor implements PageProcessor {
|
||||
|
||||
public static final String TARGET_PATTERN_ALL = "http://*";
|
||||
|
||||
protected Site site;
|
||||
|
||||
protected String targetPattern;
|
||||
|
||||
protected ArrayList<PatternHandler> handlers = new ArrayList<PatternHandler>();
|
||||
|
||||
public PatternPageProcessor(String startUrl, String targetPattern) {
|
||||
|
||||
this.targetPattern = targetPattern;
|
||||
|
||||
this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl));
|
||||
this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*",
|
||||
"[^\"'#]*") + ")";
|
||||
|
||||
site.setUserAgent("Chrome/5.0.354.0");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
|
||||
|
||||
List<String> requests = page.getHtml().links().regex(targetPattern).all();
|
||||
page.addTargetRequests(requests);
|
||||
for(PatternHandler handler : handlers) {
|
||||
if(handler.match(page.getUrl().toString())) {
|
||||
handler.processPage(page);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param handler the pattern handler
|
||||
*
|
||||
*
|
||||
*/
|
||||
public void addHandler(PatternHandler handler) {
|
||||
|
||||
handlers.add(handler);
|
||||
}
|
||||
|
||||
public void removeHandler(PatternHandler handler) {
|
||||
|
||||
handlers.remove(handler);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
|
||||
return site;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue