From 619a12b3034b4038d1a2e730b967cc316213d834 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 4 Aug 2013 21:22:15 +0800 Subject: [PATCH] add paged support --- .../java/us/codecraft/webmagic/Spider.java | 5 + .../webmagic/pipeline/ConsolePipeline.java | 1 - .../webmagic/utils/DoubleKeyMap.java | 111 ++++++++++++++++++ .../webmagic/utils/MultiKeyMapBase.java | 42 +++++++ .../webmagic/model/OschinaBlogComment.java | 13 -- .../us/codecraft/webmagic/PagedModel.java | 20 ++++ .../webmagic/pipeline/PagedPipeline.java | 78 ++++++++++++ .../webmagic/model/samples/News163.java | 81 +++++++++++++ 8 files changed, 337 insertions(+), 14 deletions(-) create mode 100755 webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java create mode 100755 webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java create mode 100644 webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java create mode 100644 webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index a25fd024..414315c2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -283,6 +283,11 @@ public class Spider implements Runnable, Task { return this; } + public Spider clearPipeline(){ + pipelines=new ArrayList(); + return this; + } + @Override public String getUUID() { if (uuid != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 97470e04..8f294745 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -29,7 +29,6 @@ public class ConsolePipeline implements Pipeline{ } else { System.out.println(entry.getKey() + ":\t" + entry.getValue()); } - System.out.println(entry.getKey()+":\t"+entry.getValue()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java new file mode 100755 index 00000000..500573aa --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -0,0 +1,111 @@ +package us.codecraft.webmagic.utils; + +import java.util.Map; + +/** + * @author yihua.huang@dianping.com + * @date Dec 14, 2012 + */ +public class DoubleKeyMap extends MultiKeyMapBase { + private Map> map; + + public DoubleKeyMap() { + init(); + } + + public DoubleKeyMap(Map> map) { + this(map,DEFAULT_CLAZZ); + } + + public DoubleKeyMap(Class protoMapClass) { + super(protoMapClass); + init(); + } + + private void init() { + if (map == null) { + map = this.>newMap(); + } + } + + /** + * init map with protoMapClass + * + * @param protoMapClass + */ + @SuppressWarnings("rawtypes") + public DoubleKeyMap(Map> map, Class protoMapClass) { + super(protoMapClass); + this.map = map; + init(); + } + + /** + * @param key + * @return + */ + public Map get(K1 key) { + return map.get(key); + } + + /** + * @param key1 + * @param key2 + * @return + */ + public V get(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + return get(key1).get(key2); + } + + + /** + * @param key1 + * @param submap + * @return + */ + public V put(K1 key1, Map submap) { + return put(key1, submap); + } + + /** + * @param key1 + * @param key2 + * @param value + * @return + */ + public V put(K1 key1, K2 key2, V value) { + if (map.get(key1) == null) { + map.put(key1, this.newMap()); + } + return 
get(key1).put(key2, value); + } + + /** + * @param key1 + * @param key2 + * @return + */ + public V remove(K1 key1, K2 key2) { + if (get(key1) == null) { + return null; + } + V remove = get(key1).remove(key2); + // 如果上一级map为空,把它也回收掉 + if (get(key1).size() == 0) { + remove(key1); + } + return remove; + } + + /** + * @param key1 + * @return + */ + public Map remove(K1 key1) { + Map remove = map.remove(key1); + return remove; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java new file mode 100755 index 00000000..e0b5c64a --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -0,0 +1,42 @@ +package us.codecraft.webmagic.utils; + +/** + * @author yihua.huang@dianping.com + * @date Dec 14, 2012 + */ + +import java.util.HashMap; +import java.util.Map; + +/** + * multikey map, some basic objects * + * + * @author yihua.huang + */ +public abstract class MultiKeyMapBase { + + protected static final Class DEFAULT_CLAZZ = HashMap.class; + @SuppressWarnings("rawtypes") + private Class protoMapClass = DEFAULT_CLAZZ; + + public MultiKeyMapBase() { + } + + @SuppressWarnings("rawtypes") + public MultiKeyMapBase(Class protoMapClass) { + this.protoMapClass = protoMapClass; + } + + @SuppressWarnings("unchecked") + protected Map newMap() { + try { + return (Map) protoMapClass.newInstance(); + } catch (InstantiationException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("wrong proto type map " + + protoMapClass); + } + } +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java b/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java deleted file mode 100644 index a1e5843a..00000000 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/model/OschinaBlogComment.java +++ /dev/null @@ -1,13 +0,0 @@ -package us.codecraft.webmagic.model; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-1
- * Time: 下午10:18
- */ -@TargetUrl("http://my.oschina.net/flashsword/blog/*") -public class OschinaBlogComment { - - - -} \ No newline at end of file diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java new file mode 100644 index 00000000..f18426a7 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/PagedModel.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic; + +import java.util.Collection; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-4
+ * Time: 5:18 PM
+ */ +public interface PagedModel { + + public String getPageKey(); + + public Collection getOtherPages(); + + public String getPage(); + + public PagedModel combine(PagedModel pagedModel); + +} diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java new file mode 100644 index 00000000..cc71e5c6 --- /dev/null +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/pipeline/PagedPipeline.java @@ -0,0 +1,78 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.PagedModel; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.DoubleKeyMap; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-4
+ * Time: 5:15 PM
+ */ +public class PagedPipeline implements Pipeline { + + private DoubleKeyMap pageMap = new DoubleKeyMap(ConcurrentHashMap.class); + + private DoubleKeyMap objectMap = new DoubleKeyMap(ConcurrentHashMap.class); + + @Override + public void process(ResultItems resultItems, Task task) { + Map resultItemsAll = resultItems.getAll(); + Iterator> iterator = resultItemsAll.entrySet().iterator(); + while (iterator.hasNext()) { + handleObject(iterator); + } + } + + private void handleObject(Iterator> iterator) { + Map.Entry objectEntry = iterator.next(); + Object o = objectEntry.getValue(); + if (o instanceof PagedModel) { + PagedModel pagedModel = (PagedModel) o; + for (String otherPage : pagedModel.getOtherPages()) { + Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); + if (aBoolean == null) { + pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); + } + } + //check if all pages are processed + Map booleanMap = pageMap.get(pagedModel.getPageKey()); + objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel); + if (booleanMap == null) { + return; + } + for (Map.Entry stringBooleanEntry : booleanMap.entrySet()) { + if (!stringBooleanEntry.getValue()) { + iterator.remove(); + return; + } + } + List> entryList = new ArrayList>(); + entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet()); + if (entryList.size() != 0) { + Collections.sort(entryList, new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + try { + int i1 = Integer.parseInt(o1.getKey()); + int i2 = Integer.parseInt(o2.getKey()); + return i1 - i2; + } catch (NumberFormatException e) { + return o1.getKey().compareTo(o2.getKey()); + } + } + }); + PagedModel value = entryList.get(0).getValue(); + for (int i=1;i + * @date: 13-8-4
+ * Time: 8:17 PM
+ */ +@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") +public class News163 implements PagedModel, AfterExtractor { + + @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html") + private String pageKey; + + @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) + private String page; + + private List otherPage; + + @ExtractBy("//h1[@id=\"h1title\"]/text()") + private String title; + + @ExtractBy("//div[@id=\"epContentLeft\"]") + private String content; + + @Override + public String getPageKey() { + return pageKey; + } + + @Override + public Collection getOtherPages() { + return otherPage; + } + + @Override + public String getPage() { + if (page == null) { + return "0"; + } + return page; + } + + @Override + public PagedModel combine(PagedModel pagedModel) { + News163 news163 = new News163(); + News163 pagedModel1 = (News163) pagedModel; + news163.content = this.content + pagedModel1.content; + return news163; + } + + @Override + public String toString() { + return "News163{" + + "content='" + content + '\'' + + ", title='" + title + '\'' + + ", otherPage=" + otherPage + + '}'; + } + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) + .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); + } + + @Override + public void afterProcess(Page page) { + Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href"); + otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all(); + } +}