add paged support
parent
a5c85c3c8b
commit
619a12b303
@ -0,0 +1,42 @@
|
||||
package us.codecraft.webmagic.utils;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com
|
||||
* @date Dec 14, 2012
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Base class for multi-key maps. It holds the prototype {@link Map} class and
 * offers {@link #newMap()} so subclasses can create fresh map instances of that
 * type.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    /** Map implementation used when no prototype class is supplied. */
    @SuppressWarnings("rawtypes")
    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    /** Class instantiated reflectively by {@link #newMap()}. */
    @SuppressWarnings("rawtypes")
    private final Class<? extends Map> protoMapClass;

    /**
     * Creates a base that produces {@link #DEFAULT_CLAZZ} instances.
     */
    public MultiKeyMapBase() {
        this(DEFAULT_CLAZZ);
    }

    /**
     * Creates a base that produces instances of the given map class.
     *
     * @param protoMapClass map implementation to instantiate; it needs an
     *                      accessible no-argument constructor, otherwise
     *                      {@link #newMap()} fails
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Instantiates a new, empty map of the prototype class.
     *
     * @param <K>  key type of the new map
     * @param <V2> value type of the new map
     * @return a fresh map instance
     * @throws IllegalArgumentException if the prototype class cannot be
     *                                  instantiated; the reflective failure is
     *                                  attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            // keep the cause so the caller can see why instantiation failed
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map "
                    + protoMapClass, e);
        }
    }
}
|
@ -1,13 +0,0 @@
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @date: 13-8-1 <br>
|
||||
* Time: 下午10:18 <br>
|
||||
*/
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/*")
|
||||
public class OschinaBlogComment {
|
||||
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-4 <br>
|
||||
* Time: 下午5:18 <br>
|
||||
*/
|
||||
public interface PagedModel {
|
||||
|
||||
public String getPageKey();
|
||||
|
||||
public Collection<String> getOtherPages();
|
||||
|
||||
public String getPage();
|
||||
|
||||
public PagedModel combine(PagedModel pagedModel);
|
||||
|
||||
}
|
@ -0,0 +1,78 @@
|
||||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import us.codecraft.webmagic.PagedModel;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.utils.DoubleKeyMap;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-4 <br>
|
||||
* Time: 下午5:15 <br>
|
||||
*/
|
||||
public class PagedPipeline implements Pipeline {
|
||||
|
||||
private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);
|
||||
|
||||
private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
Map<String, Object> resultItemsAll = resultItems.getAll();
|
||||
Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
|
||||
while (iterator.hasNext()) {
|
||||
handleObject(iterator);
|
||||
}
|
||||
}
|
||||
|
||||
private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
|
||||
Map.Entry<String, Object> objectEntry = iterator.next();
|
||||
Object o = objectEntry.getValue();
|
||||
if (o instanceof PagedModel) {
|
||||
PagedModel pagedModel = (PagedModel) o;
|
||||
for (String otherPage : pagedModel.getOtherPages()) {
|
||||
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
||||
if (aBoolean == null) {
|
||||
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
//check if all pages are processed
|
||||
Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
|
||||
objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
|
||||
if (booleanMap == null) {
|
||||
return;
|
||||
}
|
||||
for (Map.Entry<String, Boolean> stringBooleanEntry : booleanMap.entrySet()) {
|
||||
if (!stringBooleanEntry.getValue()) {
|
||||
iterator.remove();
|
||||
return;
|
||||
}
|
||||
}
|
||||
List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
|
||||
entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
|
||||
if (entryList.size() != 0) {
|
||||
Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
|
||||
@Override
|
||||
public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
|
||||
try {
|
||||
int i1 = Integer.parseInt(o1.getKey());
|
||||
int i2 = Integer.parseInt(o2.getKey());
|
||||
return i1 - i2;
|
||||
} catch (NumberFormatException e) {
|
||||
return o1.getKey().compareTo(o2.getKey());
|
||||
}
|
||||
}
|
||||
});
|
||||
PagedModel value = entryList.get(0).getValue();
|
||||
for (int i=1;i<entryList.size();i++){
|
||||
value=value.combine(entryList.get(i).getValue());
|
||||
}
|
||||
objectEntry.setValue(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,81 @@
|
||||
package us.codecraft.webmagic.model.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.PagedModel;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.*;
|
||||
import us.codecraft.webmagic.pipeline.ConsolePipeline;
|
||||
import us.codecraft.webmagic.pipeline.PagedPipeline;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author yihua.huang@dianping.com <br>
|
||||
* @date: 13-8-4 <br>
|
||||
* Time: 下午8:17 <br>
|
||||
*/
|
||||
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
|
||||
public class News163 implements PagedModel, AfterExtractor {
|
||||
|
||||
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/(\\w+)*\\.html")
|
||||
private String pageKey;
|
||||
|
||||
@ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false)
|
||||
private String page;
|
||||
|
||||
private List<String> otherPage;
|
||||
|
||||
@ExtractBy("//h1[@id=\"h1title\"]/text()")
|
||||
private String title;
|
||||
|
||||
@ExtractBy("//div[@id=\"epContentLeft\"]")
|
||||
private String content;
|
||||
|
||||
@Override
|
||||
public String getPageKey() {
|
||||
return pageKey;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> getOtherPages() {
|
||||
return otherPage;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPage() {
|
||||
if (page == null) {
|
||||
return "0";
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
@Override
|
||||
public PagedModel combine(PagedModel pagedModel) {
|
||||
News163 news163 = new News163();
|
||||
News163 pagedModel1 = (News163) pagedModel;
|
||||
news163.content = this.content + pagedModel1.content;
|
||||
return news163;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "News163{" +
|
||||
"content='" + content + '\'' +
|
||||
", title='" + title + '\'' +
|
||||
", otherPage=" + otherPage +
|
||||
'}';
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
|
||||
.clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterProcess(Page page) {
|
||||
Selectable xpath = page.getHtml().xpath("//div[@class=\"ep-pages\"]//a/@href");
|
||||
otherPage = xpath.regex("http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html").all();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue