add paged support

pull/17/head
yihua.huang 12 years ago
parent a5c85c3c8b
commit 619a12b303

@ -283,6 +283,11 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Discards every pipeline registered so far by swapping in a fresh, empty list.
 *
 * @return this spider, so calls can be chained
 */
public Spider clearPipeline() {
    this.pipelines = new ArrayList<Pipeline>();
    return this;
}
@Override
public String getUUID() {
if (uuid != null) {

@ -29,7 +29,6 @@ public class ConsolePipeline implements Pipeline{
} else {
System.out.println(entry.getKey() + ":\t" + entry.getValue());
}
System.out.println(entry.getKey()+":\t"+entry.getValue());
}
}
}

@ -0,0 +1,111 @@
package us.codecraft.webmagic.utils;
import java.util.Map;
/**
 * A map addressed by two keys, stored as a map of sub-maps: the first key selects a
 * sub-map and the second key selects the value inside it. New maps (the top level when
 * none is supplied, and every sub-map created by {@link #put(Object, Object, Object)})
 * are instantiated through {@code newMap()} of the base class.
 *
 * @author yihua.huang@dianping.com
 * @date Dec 14, 2012
 */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {

    /** First-level key -> (second-level key -> value). */
    private Map<K1, Map<K2, V>> map;

    /**
     * Creates an empty map using the base class's default map type.
     */
    public DoubleKeyMap() {
        init();
    }

    /**
     * Wraps an existing two-level map; sub-maps created later use the default map type.
     *
     * @param map backing map to use; when {@code null} a new one is created
     */
    public DoubleKeyMap(Map<K1, Map<K2, V>> map) {
        this(map, DEFAULT_CLAZZ);
    }

    /**
     * Creates an empty map whose levels are instances of the given map class.
     *
     * @param protoMapClass map implementation to instantiate for each level
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        init();
    }

    /**
     * Wraps an existing two-level map and uses the given map class for maps created later.
     *
     * @param map           backing map to use; when {@code null} a new one is created
     * @param protoMapClass map implementation to instantiate for new levels
     */
    @SuppressWarnings("rawtypes")
    public DoubleKeyMap(Map<K1, Map<K2, V>> map, Class<? extends Map> protoMapClass) {
        super(protoMapClass);
        this.map = map;
        init();
    }

    /** Creates the top-level map unless a constructor already supplied one. */
    private void init() {
        if (map == null) {
            map = this.<K1, Map<K2, V>>newMap();
        }
    }

    /**
     * @param key first-level key
     * @return the sub-map stored under {@code key}, or {@code null} if there is none
     */
    public Map<K2, V> get(K1 key) {
        return map.get(key);
    }

    /**
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the value stored under both keys, or {@code null} if either level is missing
     */
    public V get(K1 key1, K2 key2) {
        Map<K2, V> submap = get(key1);
        if (submap == null) {
            return null;
        }
        return submap.get(key2);
    }

    /**
     * Stores {@code submap} as the whole second level for {@code key1}, replacing any
     * sub-map that was there before.
     *
     * @param key1   first-level key
     * @param submap sub-map to store under {@code key1}
     * @return always {@code null}; the declared return type is a single value, and a
     *         replaced sub-map has no single value to report
     */
    public V put(K1 key1, Map<K2, V> submap) {
        // Must delegate to the backing map: calling put(key1, submap) here would resolve
        // to this very method and recurse until the stack overflows.
        map.put(key1, submap);
        return null;
    }

    /**
     * Stores {@code value} under both keys, creating the sub-map for {@code key1} on demand.
     *
     * @param key1  first-level key
     * @param key2  second-level key
     * @param value value to store
     * @return the value previously stored under both keys, as reported by the sub-map
     */
    public V put(K1 key1, K2 key2, V value) {
        Map<K2, V> submap = map.get(key1);
        if (submap == null) {
            submap = this.<K2, V>newMap();
            map.put(key1, submap);
        }
        return submap.put(key2, value);
    }

    /**
     * Removes the value stored under both keys.
     *
     * @param key1 first-level key
     * @param key2 second-level key
     * @return the removed value, or {@code null} if there was no sub-map or no such entry
     */
    public V remove(K1 key1, K2 key2) {
        Map<K2, V> submap = get(key1);
        if (submap == null) {
            return null;
        }
        V removed = submap.remove(key2);
        // If the sub-map is now empty, drop it from the top-level map as well.
        if (submap.isEmpty()) {
            remove(key1);
        }
        return removed;
    }

    /**
     * Removes the whole sub-map stored under {@code key1}.
     *
     * @param key1 first-level key
     * @return the removed sub-map, or {@code null} if there was none
     */
    public Map<K2, V> remove(K1 key1) {
        return map.remove(key1);
    }
}

@ -0,0 +1,42 @@
package us.codecraft.webmagic.utils;
/**
* @author yihua.huang@dianping.com
* @date Dec 14, 2012
*/
import java.util.HashMap;
import java.util.Map;
/**
 * Base class for multi-key maps. It holds the map implementation class ("prototype")
 * that subclasses instantiate through {@link #newMap()} whenever they need a new level.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    /** Map implementation used when no prototype class is given. */
    @SuppressWarnings("rawtypes")
    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    @SuppressWarnings("rawtypes")
    private Class<? extends Map> protoMapClass = DEFAULT_CLAZZ;

    /**
     * Uses {@link #DEFAULT_CLAZZ} as the prototype map class.
     */
    public MultiKeyMapBase() {
    }

    /**
     * @param protoMapClass map implementation that {@link #newMap()} will instantiate
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Creates a new, empty map by calling the no-argument constructor of the prototype class.
     *
     * @return a fresh instance of the prototype map class
     * @throws IllegalArgumentException if the prototype class cannot be instantiated;
     *                                  the reflective failure is attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            // Keep the reflective exception as the cause so the real reason is not lost.
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        }
    }
}

@ -1,13 +0,0 @@
package us.codecraft.webmagic.model;
/**
 * Empty model class bound to the blog URL pattern given in {@code @TargetUrl};
 * it declares no fields and no extraction rules.
 * NOTE(review): judging by the name it is presumably intended for blog comments - confirm.
 *
 * @author code4crafter@gmail.com <br>
 * @date: 13-8-1 <br>
 * Time: 10:18 <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
public class OschinaBlogComment {
}

@ -0,0 +1,20 @@
package us.codecraft.webmagic;
import java.util.Collection;
/**
 * Contract for a model object that is one page of an item spread over several pages.
 * PagedPipeline groups pages by {@link #getPageKey()}, tracks them by
 * {@link #getPage()} / {@link #getOtherPages()}, and merges them with
 * {@link #combine(PagedModel)}.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-4 <br>
 * Time: 5:18 <br>
 */
public interface PagedModel {

    /**
     * @return the key shared by all pages that belong to the same item
     */
    String getPageKey();

    /**
     * @return identifiers of the other pages of the same item
     */
    Collection<String> getOtherPages();

    /**
     * @return the identifier of this page within its item
     */
    String getPage();

    /**
     * Merges this page with another page of the same item.
     *
     * @param pagedModel the page to merge with this one
     * @return the merged result
     */
    PagedModel combine(PagedModel pagedModel);
}

@ -0,0 +1,78 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Pipeline that merges the pages of a multi-page item into one object.
 * <p>
 * Every {@link PagedModel} found in the result items is remembered under its page key.
 * While some page of the same key is still outstanding, the entry is removed from the
 * result items so that later pipelines do not see a partial item. Once every known page
 * has arrived, the pages are sorted by page identifier, folded together with
 * {@link PagedModel#combine(PagedModel)}, and the merged object replaces the entry.
 * Entries that are not {@link PagedModel}s are left untouched.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-4 <br>
 * Time: 5:15 <br>
 */
public class PagedPipeline implements Pipeline {

    /**
     * Orders pages by identifier: numerically when both identifiers parse as integers,
     * otherwise by plain string comparison.
     */
    private static final Comparator<Map.Entry<String, PagedModel>> PAGE_ORDER =
            new Comparator<Map.Entry<String, PagedModel>>() {
                @Override
                public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
                    try {
                        int i1 = Integer.parseInt(o1.getKey());
                        int i2 = Integer.parseInt(o2.getKey());
                        // Explicit comparison instead of i1 - i2, which can overflow.
                        return i1 < i2 ? -1 : (i1 > i2 ? 1 : 0);
                    } catch (NumberFormatException e) {
                        return o1.getKey().compareTo(o2.getKey());
                    }
                }
            };

    /** page key -> page identifier -> whether that page has been processed. */
    private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);

    /** page key -> page identifier -> the model extracted from that page. */
    private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);

    /**
     * Runs the paging logic over every entry of the result items.
     *
     * @param resultItems extracted results; entries may be removed or replaced in place
     * @param task        the owning task (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> resultItemsAll = resultItems.getAll();
        Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
        while (iterator.hasNext()) {
            handleObject(iterator);
        }
    }

    /**
     * Consumes the next entry of the iterator and, if it holds a {@link PagedModel},
     * either withholds it (pages missing) or replaces it with the merged item.
     *
     * @param iterator iterator over the result item entries; advanced by exactly one
     */
    private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
        Map.Entry<String, Object> objectEntry = iterator.next();
        Object o = objectEntry.getValue();
        if (!(o instanceof PagedModel)) {
            return;
        }
        PagedModel pagedModel = (PagedModel) o;
        String pageKey = pagedModel.getPageKey();
        String page = pagedModel.getPage();

        // Mark the current page as processed. Without this, a page that another page
        // registered as outstanding would stay FALSE forever and the item would never
        // be completed.
        pageMap.put(pageKey, page, Boolean.TRUE);
        objectMap.put(pageKey, page, pagedModel);

        // Register pages we have not heard of yet as outstanding.
        Collection<String> otherPages = pagedModel.getOtherPages();
        if (otherPages != null) {
            for (String otherPage : otherPages) {
                if (pageMap.get(pageKey, otherPage) == null) {
                    pageMap.put(pageKey, otherPage, Boolean.FALSE);
                }
            }
        }

        // check if all pages are processed
        for (Boolean processed : pageMap.get(pageKey).values()) {
            if (!processed) {
                // Still incomplete: hide the partial item from later pipelines.
                iterator.remove();
                return;
            }
        }

        // All pages are present: merge them in page order.
        List<Map.Entry<String, PagedModel>> entryList =
                new ArrayList<Map.Entry<String, PagedModel>>(objectMap.get(pageKey).entrySet());
        Collections.sort(entryList, PAGE_ORDER);
        PagedModel value = entryList.get(0).getValue();
        for (int i = 1; i < entryList.size(); i++) {
            value = value.combine(entryList.get(i).getValue());
        }
        objectEntry.setValue(value);
    }
}

@ -0,0 +1,81 @@
package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.PagedModel;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.*;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Collection;
import java.util.List;
/**
 * Sample paged model for news.163.com articles that are split over several pages.
 * The page key and page number are taken from the URL, the title and content from the
 * HTML, and the numbers of the other pages from the pager links (see
 * {@link #afterProcess(Page)}).
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-4 <br>
 * Time: 8:17 <br>
 */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel, AfterExtractor {

    // Capture only the part of the file name before the first '_'. A \w+ group would
    // also swallow the "_<page>" suffix, giving every page of one article its own key.
    @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
    private String pageKey;

    // Page number from the "_<n>" suffix; absent on URLs without such a suffix.
    @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
    private String page;

    private List<String> otherPage;

    @ExtractBy("//h1[@id=\"h1title\"]/text()")
    private String title;

    @ExtractBy("//div[@id=\"epContentLeft\"]")
    private String content;

    /**
     * @return the article identifier shared by all pages of the article
     */
    @Override
    public String getPageKey() {
        return pageKey;
    }

    /**
     * @return the page numbers found in the pager links; empty (never {@code null})
     *         when none have been collected
     */
    @Override
    public Collection<String> getOtherPages() {
        if (otherPage == null) {
            return java.util.Collections.<String>emptyList();
        }
        return otherPage;
    }

    /**
     * @return this page's number, or {@code "0"} when the URL carried no page suffix
     */
    @Override
    public String getPage() {
        if (page == null) {
            return "0";
        }
        return page;
    }

    /**
     * Builds a new article whose content is this page's content followed by the other
     * page's content. The title and page key are carried over from this page so the
     * merged object still identifies the article.
     *
     * @param pagedModel the following page; must be a {@code News163}
     * @return a new merged {@code News163}
     */
    @Override
    public PagedModel combine(PagedModel pagedModel) {
        News163 news163 = new News163();
        News163 pagedModel1 = (News163) pagedModel;
        news163.pageKey = this.pageKey;
        news163.title = this.title;
        news163.content = this.content + pagedModel1.content;
        return news163;
    }

    @Override
    public String toString() {
        return "News163{" +
                "content='" + content + '\'' +
                ", title='" + title + '\'' +
                ", otherPage=" + otherPage +
                '}';
    }

    /**
     * Crawls one sample article and prints the merged result to the console.
     */
    public static void main(String[] args) {
        OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
                .clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
    }

    /**
     * Collects the page numbers of the article's other pages from the pager links.
     *
     * @param page the page this model was extracted from
     */
    @Override
    public void afterProcess(Page page) {
        Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
        otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
    }
}
Loading…
Cancel
Save