From 36494bcfa52d58157117a25c504e73de6b15e1da Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 23:01:43 +0800 Subject: [PATCH] add xpath2.0 api --- .../webmagic/pipeline/ConsolePipeline.java | 21 +++++++------------ .../us/codecraft/webmagic/selector/Html.java | 6 ++++++ .../webmagic/selector/PlainText.java | 5 +++++ .../webmagic/selector/Selectable.java | 8 +++++++ .../webmagic/selector/SelectorFactory.java | 4 ++++ .../webmagic/downloader/FileDownloader.java | 0 .../scheduler}/FileCacheQueueScheduler.java | 3 ++- .../webmagic/utils/DoubleKeyMap.java | 0 .../webmagic/utils/MultiKeyMapBase.java | 0 .../webmagic/model/samples/OschinaBlog.java | 3 +-- .../webmagic/samples/GuoxueProcessor.java | 2 +- .../us/codecraft/webmagic/SpiderTest.java | 2 +- .../processor/DiandianProcessorTest.java | 2 +- .../processor/DiaoyuwengProcessorTest.java | 2 +- .../processor/SinablogProcessorTest.java | 2 +- 15 files changed, 38 insertions(+), 22 deletions(-) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java (100%) rename {webmagic-core/src/main/java/us/codecraft/webmagic/schedular => webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler}/FileCacheQueueScheduler.java (97%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java (100%) rename {webmagic-core => webmagic-plugin/webmagic-misc}/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java (100%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 2ff99c87..e1648fe7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,25 +7,18 @@ import java.util.Map; /** * 命令行输出抽取结果。可用于测试。
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:45 + * Date: 13-4-21 + * Time: 下午1:45 */ -public class ConsolePipeline implements Pipeline{ +public class ConsolePipeline implements Pipeline { @Override - public void process(ResultItems resultItems,Task task) { - System.out.println("get page: "+resultItems.getRequest().getUrl()); + public void process(ResultItems resultItems, Task task) { + System.out.println("get page: " + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { - if (entry.getValue() instanceof Iterable) { - Iterable value = (Iterable) entry.getValue(); - System.out.println(entry.getKey() + ":"); - for (Object o : value) { - System.out.println(o); - } - } else { - System.out.println(entry.getKey() + ":\t" + entry.getValue()); - } + System.out.println(entry.getKey() + ":\t" + entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 114eef99..79d62a01 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -63,6 +63,12 @@ public class Html extends PlainText { return selectList(xpathSelector, strings); } + @Override + public Selectable xpath2(String xpath) { + Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath); + return selectList(xpathSelector, strings); + } + @Override public Selectable $(String selector) { CssSelector cssSelector = new CssSelector(selector); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index d06a5310..4fff6da8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -34,6 +34,11 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } + @Override + public Selectable xpath2(String xpath) { + throw new UnsupportedOperationException(); + } + @Override public Selectable $(String selector) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 42f3d108..cea501dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,6 +18,14 @@ public interface Selectable { */ public Selectable xpath(String xpath); + /** + * select list with xpath 2.0 syntax + * + * @param xpath + * @return new Selectable after extract + */ + public Selectable xpath2(String xpath); + /** * select list with css selector * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 1dd56e01..9abb1ce3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -34,6 +34,10 @@ public class SelectorFactory { return newSelector(XpathSelector.class, xpath); } + public Xpath2Selector newXpath2Selector(String xpath) { + return newSelector(Xpath2Selector.class, xpath); + } + public SmartContentSelector newSmartContentSelector(){ return newSelector(SmartContentSelector.class); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java similarity index 97% rename from webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index f5393a33..d4a3987d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,9 +1,10 @@ -package us.codecraft.webmagic.schedular; +package us.codecraft.webmagic.scheduler; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.schedular.Scheduler; import java.io.*; import java.util.LinkedHashSet; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java rename to webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 38cb41f0..817ba448 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -4,7 +4,6 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.model.ExtractBy; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.TargetUrl; -import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.JsonFilePipeline; /** @@ -30,7 +29,7 @@ public class OschinaBlog implements Blog{ } public static void main(String[] args) { - OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new ConsolePipeline()).pipeline(new JsonFilePipeline()).run(); + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(new JsonFilePipeline()).run(); } public String getTitle() { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java index db00c79c..5d7d3559 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GuoxueProcessor.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 76a423fb..dbfa8154 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -5,7 +5,7 @@ import org.junit.Test; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.samples.HuxiuProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; /** * @author code4crafter@gmail.com
diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java index 13910b52..cf587f1d 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiandianProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiandianBlogProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java index 33bcf9c6..69a535c6 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/DiaoyuwengProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.DiaoyuwengProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException; diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java index a0160e18..a44fe35b 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java @@ -6,7 +6,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.pipeline.FreemarkerPipeline; import us.codecraft.webmagic.samples.SinaBlogProcesser; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; import java.io.IOException;