diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java deleted file mode 100644 index e2303a07..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorDemo.java +++ /dev/null @@ -1,56 +0,0 @@ -package us.codecraft.webmagic.example; - -import org.apache.log4j.Logger; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.handler.PatternHandler; -import us.codecraft.webmagic.handler.SubPageProcessor; -import us.codecraft.webmagic.pipeline.PatternPipeline; -import us.codecraft.webmagic.processor.PatternPageProcessor; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 04, 2014 - * Time: 21:23 - */ -public class PatternProcessorDemo { - - private static Logger log = Logger.getLogger(PatternProcessorDemo.class); - - public static void main(String... 
args) { - - PatternPageProcessor processor - = new PatternPageProcessor("http://item.jd.com/981821.html", - PatternPageProcessor.TARGET_PATTERN_ALL - ); - - PatternPipeline pipeline = new PatternPipeline(); - - // define a handler which handles only "http://item.jd.com/.*" - PatternHandler handler = new PatternHandler("http://item.jd.com/.*") { - - @Override - public SubPageProcessor.MatchOtherProcessor process(Page page) { - - log.info("Extracting from " + page.getUrl()); - page.putField("test", "hello world:)"); - return MatchOtherProcessor.YES; - } - - @Override - public void handle(ResultItems result, Task task) { - - log.info("Handling " + result.getRequest().getUrl()); - log.info("Retrieved test=" + result.get("test")); - } - }; - - processor.addHandler(handler); - pipeline.addHandler(handler); - - Spider.create(processor).thread(5).addPipeline(pipeline).runAsync(); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java new file mode 100644 index 00000000..84b3164d --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -0,0 +1,66 @@ +package us.codecraft.webmagic.example; + +import org.apache.log4j.Logger; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.handler.CompositePageProcessor; +import us.codecraft.webmagic.handler.CompositePipeline; +import us.codecraft.webmagic.handler.PatternProcessor; +import us.codecraft.webmagic.handler.RequestMatcher; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 04, 2014 + * Time: 21:23 + */ +public class PatternProcessorExample { + + private static Logger log = Logger.getLogger(PatternProcessorExample.class); + + public static void main(String... 
args) { + + // define a patternProcessor which handles GitHub repository URLs ("https://github.com/{owner}/{repo}") + PatternProcessor githubRepoProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+/[\\w\\-]+") { + + @Override + public RequestMatcher.MatchOther processPage(Page page) { + page.putField("reponame", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + return RequestMatcher.MatchOther.YES; + } + + @Override + public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { + log.info("Extracting from repo" + resultItems.getRequest()); + System.out.println(resultItems.get("reponame")); + return RequestMatcher.MatchOther.YES; + } + }; + + PatternProcessor githubUserProcessor = new PatternProcessor("https://github\\.com/[\\w\\-]+") { + + @Override + public RequestMatcher.MatchOther processPage(Page page) { + log.info("Extracting from " + page.getUrl()); + page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+/[\\w\\-]+").all()); + page.addTargetRequests(page.getHtml().links().regex("https://github\\.com/[\\w\\-]+").all()); + page.putField("username", page.getHtml().xpath("//span[@class='vcard-fullname']/text()").toString()); + return RequestMatcher.MatchOther.YES; + } + + @Override + public RequestMatcher.MatchOther processResult(ResultItems resultItems, Task task) { + System.out.println(resultItems.get("username")); + return RequestMatcher.MatchOther.YES; + } + }; + + CompositePageProcessor pageProcessor = new CompositePageProcessor(Site.me().setDomain("github.com")); + CompositePipeline pipeline = new CompositePipeline(); + + pageProcessor.setSubPageProcessors(githubRepoProcessor, githubUserProcessor); + pipeline.setSubPipeline(githubRepoProcessor, githubUserProcessor); + + Spider.create(pageProcessor).addUrl("https://github.com/code4craft").thread(5).addPipeline(pipeline).runAsync(); + } + +} diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java index ecf4aa1d..20734456 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePageProcessor.java @@ -15,14 +15,18 @@ public class CompositePageProcessor implements PageProcessor { private Site site; - private List subPageProcessors; + private List subPageProcessors = new ArrayList(); + + public CompositePageProcessor(Site site) { + this.site = site; + } @Override public void process(Page page) { for (SubPageProcessor subPageProcessor : subPageProcessors) { - if (subPageProcessor.match(page)) { - SubPageProcessor.MatchOtherProcessor matchOtherProcessorProcessor = subPageProcessor.process(page); - if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOtherProcessor.YES) { + if (subPageProcessor.match(page.getRequest())) { + SubPageProcessor.MatchOther matchOtherProcessorProcessor = subPageProcessor.processPage(page); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != SubPageProcessor.MatchOther.YES) { return; } } @@ -34,6 +38,11 @@ public class CompositePageProcessor implements PageProcessor { return this; } + public CompositePageProcessor addSubPageProcessor(SubPageProcessor subPageProcessor) { + this.subPageProcessors.add(subPageProcessor); + return this; + } + public CompositePageProcessor setSubPageProcessors(SubPageProcessor... 
subPageProcessors) { this.subPageProcessors = new ArrayList(); for (SubPageProcessor subPageProcessor : subPageProcessors) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java new file mode 100644 index 00000000..3f09eee2 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.pipeline.Pipeline; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class CompositePipeline implements Pipeline { + + private List subPipelines = new ArrayList(); + + @Override + public void process(ResultItems resultItems, Task task) { + for (SubPipeline subPipeline : subPipelines) { + if (subPipeline.match(resultItems.getRequest())) { + RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task); + if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) { + return; + } + } + } + } + + public CompositePipeline addSubPipeline(SubPipeline subPipeline) { + this.subPipelines.add(subPipeline); + return this; + } + + public CompositePipeline setSubPipeline(SubPipeline... 
subPipelines) { + this.subPipelines = new ArrayList(); + for (SubPipeline subPipeline : subPipelines) { + this.subPipelines.add(subPipeline); + } + return this; + } + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java deleted file mode 100644 index 4be03de1..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternHandler.java +++ /dev/null @@ -1,90 +0,0 @@ -package us.codecraft.webmagic.handler; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; - -import java.util.UUID; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 03, 2014 - * Time: 10:00 - *

- * A PatternHandler is in charge of both page extraction and data processing by implementing - * its two abstract methods. - */ -public abstract class PatternHandler implements SubPageProcessor { - - /** - * identity of the handler. - */ - protected String id; - - /** - * match pattern. only matched page should be handled. - */ - protected String pattern; - - /** - * @param pattern - * url pattern to handle - */ - protected PatternHandler(String pattern) { - - this.pattern = pattern; - this.id = UUID.randomUUID().toString(); - } - - /** - * determine if the page should be handled. - */ - public boolean match(String url) { - - return url.matches(pattern); - } - - public boolean processPage(Page page) { - - if(match(page.getUrl().toString())) { - page.putField(id, true); - process(page); - return true; - } else { - return false; - } - } - - public boolean processResult(ResultItems resultItems, Task task) { - - if(resultItems.isSkip()) { - return false; - } - - if(match(resultItems.getRequest().getUrl()) && resultItems.get(id) != null) { - handle(resultItems, task); - return true; - } else { - return false; - } - } - - /** - * override this method to handle the extraction result. 
this method MUST use - * with PatternPipeline - * - * @param result - * extraction result - * @param task - */ - public void handle(ResultItems result, Task task) { - - } - - @Override - public boolean match(Page page) { - - return match(page.getUrl().toString()); - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java new file mode 100644 index 00000000..f9ef286b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.handler; + +/** + * @author code4crafer@gmail.com + */ +public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor { + /** + * @param pattern url pattern to handle + */ + public PatternProcessor(String pattern) { + super(pattern); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java new file mode 100644 index 00000000..5c0f31a2 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java @@ -0,0 +1,37 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Request; + +import java.util.regex.Pattern; + +/** + * Created with IntelliJ IDEA. + * User: Sebastian MA + * Date: April 03, 2014 + * Time: 10:00 + *

+ * A PatternRequestMatcher matches a request when its URL contains a match for the configured + * regular expression pattern. + */ +public abstract class PatternRequestMatcher implements RequestMatcher { + + /** + * match pattern. only matched page should be handled. + */ + protected String pattern; + + private Pattern patternCompiled; + + /** + * @param pattern url pattern to handle + */ + public PatternRequestMatcher(String pattern) { + this.pattern = pattern; + this.patternCompiled = Pattern.compile(pattern); + } + + @Override + public boolean match(Request request) { + return patternCompiled.matcher(request.getUrl()).find(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java new file mode 100644 index 00000000..31b9a787 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java @@ -0,0 +1,24 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.Request; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface RequestMatcher { + + /** + * Check whether to process the page.

+ * Please DO NOT change page status in this method. + * + * @param page + * + * @return + */ + public boolean match(Request page); + + public enum MatchOther { + YES, NO + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java index 3778a620..1b6e2830 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java @@ -6,17 +6,7 @@ import us.codecraft.webmagic.Page; * @author code4crafter@gmail.com * @date 14-4-5 */ -public interface SubPageProcessor { - - /** - * Check whether the SubPageProcessor can process the page.

- * Please DO NOT change page status in this method. - * - * @param page - * - * @return - */ - public boolean match(Page page); +public interface SubPageProcessor extends RequestMatcher { /** * process the page, extract urls to fetch, extract the data and store @@ -25,10 +15,6 @@ public interface SubPageProcessor { * * @return whether continue to match */ - public MatchOtherProcessor process(Page page); - - public enum MatchOtherProcessor { - YES, NO - } + public MatchOther processPage(Page page); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java new file mode 100644 index 00000000..40456089 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java @@ -0,0 +1,21 @@ +package us.codecraft.webmagic.handler; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +/** + * @author code4crafer@gmail.com + * @since 0.5.0 + */ +public interface SubPipeline extends RequestMatcher { + + /** + * process the result items extracted from a matched request + * + * @param resultItems + * @param task + * @return whether continue to match + */ + public MatchOther processResult(ResultItems resultItems, Task task); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java deleted file mode 100644 index c614114d..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PatternPipeline.java +++ /dev/null @@ -1,43 +0,0 @@ -package us.codecraft.webmagic.pipeline; - -import us.codecraft.webmagic.ResultItems; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.handler.PatternHandler; - -import java.util.ArrayList; - -/** - * Created with IntelliJ IDEA. 
- * User: Sebastian MA - * Date: April 04, 2014 - * Time: 20:44 - */ -public class PatternPipeline implements Pipeline { - - protected ArrayList handlers = new ArrayList(); - - /** - * A handler works only if it is added to BOTH the page processor and the pipeline. - * Uses PatternHandler's register instead. - * - * @param handler the pattern handler - * - */ - public void addHandler(PatternHandler handler) { - - handlers.add(handler); - } - - public void removeHandler(PatternHandler handler) { - - handlers.remove(handler); - } - - @Override - public void process(ResultItems resultItems, Task task) { - - for(PatternHandler handler : handlers) { - handler.processResult(resultItems, task); - } - } -} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java deleted file mode 100644 index 51dbabe3..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/processor/PatternPageProcessor.java +++ /dev/null @@ -1,76 +0,0 @@ -package us.codecraft.webmagic.processor; - -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.handler.PatternHandler; -import us.codecraft.webmagic.utils.UrlUtils; - -import java.util.ArrayList; -import java.util.List; - -/** - * Created with IntelliJ IDEA. - * User: Sebastian MA - * Date: April 04, 2014 - * Time: 15:36 - *

- * A PatternPageProcessor uses PatternHandler to setup extraction rules for specific url pattern. - * - * @see us.codecraft.webmagic.handler.PatternHandler - */ -public class PatternPageProcessor implements PageProcessor { - - public static final String TARGET_PATTERN_ALL = "http://*"; - - protected Site site; - - protected String targetPattern; - - protected ArrayList handlers = new ArrayList(); - - public PatternPageProcessor(String startUrl, String targetPattern) { - - this.targetPattern = targetPattern; - - this.site = Site.me().addStartUrl(startUrl).setDomain(UrlUtils.getDomain(startUrl)); - this.targetPattern = "(" + targetPattern.replace(".", "\\.").replace("*", - "[^\"'#]*") + ")"; - - site.setUserAgent("Chrome/5.0.354.0"); - } - - @Override - public void process(Page page) { - - - List requests = page.getHtml().links().regex(targetPattern).all(); - page.addTargetRequests(requests); - for(PatternHandler handler : handlers) { - if(handler.match(page.getUrl().toString())) { - handler.processPage(page); - } - } - } - - /** - * - * @param handler the pattern handler - * - * - */ - public void addHandler(PatternHandler handler) { - - handlers.add(handler); - } - - public void removeHandler(PatternHandler handler) { - - handlers.remove(handler); - } - - @Override - public Site getSite() { - - return site; - } -}