|
|
|
@ -1,26 +1,28 @@
|
|
|
|
|
package us.codecraft.webmagic.pipeline;
|
|
|
|
|
|
|
|
|
|
import us.codecraft.webmagic.PagedModel;
|
|
|
|
|
import us.codecraft.webmagic.MultiPageModel;
|
|
|
|
|
import us.codecraft.webmagic.ResultItems;
|
|
|
|
|
import us.codecraft.webmagic.Task;
|
|
|
|
|
import us.codecraft.webmagic.model.annotation.Experimental;
|
|
|
|
|
import us.codecraft.webmagic.utils.DoubleKeyMap;
|
|
|
|
|
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Pipeline used to implement pagination.<br>
|
|
|
|
|
* Do not use this feature when running a distributed crawler backed by redis.<br>
|
|
|
|
|
* A pipeline that combines the results extracted from more than one page into a single result.<br>
|
|
|
|
|
* Used for news and articles containing more than one web page. <br>
|
|
|
|
|
* This pipeline stores the parts of an object and outputs them once all parts have been extracted.<br>
|
|
|
|
|
*
|
|
|
|
|
* @author code4crafter@gmail.com <br>
|
|
|
|
|
* Date: 13-8-4 <br>
|
|
|
|
|
* Time: 下午5:15 <br>
|
|
|
|
|
* @since 0.2.0
|
|
|
|
|
*/
|
|
|
|
|
public class PagedPipeline implements Pipeline {
|
|
|
|
|
@Experimental
|
|
|
|
|
public class MultiPagePipeline implements Pipeline {
|
|
|
|
|
|
|
|
|
|
private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);
|
|
|
|
|
|
|
|
|
|
private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class);
|
|
|
|
|
private DoubleKeyMap<String, String, MultiPageModel> objectMap = new DoubleKeyMap<String, String, MultiPageModel>(ConcurrentHashMap.class);
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void process(ResultItems resultItems, Task task) {
|
|
|
|
@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
|
|
|
|
|
private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
|
|
|
|
|
Map.Entry<String, Object> objectEntry = iterator.next();
|
|
|
|
|
Object o = objectEntry.getValue();
|
|
|
|
|
if (o instanceof PagedModel) {
|
|
|
|
|
PagedModel pagedModel = (PagedModel) o;
|
|
|
|
|
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE);
|
|
|
|
|
if (pagedModel.getOtherPages() != null) {
|
|
|
|
|
for (String otherPage : pagedModel.getOtherPages()) {
|
|
|
|
|
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage);
|
|
|
|
|
if (o instanceof MultiPageModel) {
|
|
|
|
|
MultiPageModel multiPageModel = (MultiPageModel) o;
|
|
|
|
|
pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
|
|
|
|
|
if (multiPageModel.getOtherPages() != null) {
|
|
|
|
|
for (String otherPage : multiPageModel.getOtherPages()) {
|
|
|
|
|
Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
|
|
|
|
|
if (aBoolean == null) {
|
|
|
|
|
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE);
|
|
|
|
|
pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
//check if all pages are processed
|
|
|
|
|
Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey());
|
|
|
|
|
objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel);
|
|
|
|
|
Map<String, Boolean> booleanMap = pageMap.get(multiPageModel.getPageKey());
|
|
|
|
|
objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
|
|
|
|
|
if (booleanMap == null) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>();
|
|
|
|
|
entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet());
|
|
|
|
|
List<Map.Entry<String, MultiPageModel>> entryList = new ArrayList<Map.Entry<String, MultiPageModel>>();
|
|
|
|
|
entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
|
|
|
|
|
if (entryList.size() != 0) {
|
|
|
|
|
Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() {
|
|
|
|
|
Collections.sort(entryList, new Comparator<Map.Entry<String, MultiPageModel>>() {
|
|
|
|
|
@Override
|
|
|
|
|
public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) {
|
|
|
|
|
public int compare(Map.Entry<String, MultiPageModel> o1, Map.Entry<String, MultiPageModel> o2) {
|
|
|
|
|
try {
|
|
|
|
|
int i1 = Integer.parseInt(o1.getKey());
|
|
|
|
|
int i2 = Integer.parseInt(o2.getKey());
|
|
|
|
@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
PagedModel value = entryList.get(0).getValue();
|
|
|
|
|
MultiPageModel value = entryList.get(0).getValue();
|
|
|
|
|
for (int i = 1; i < entryList.size(); i++) {
|
|
|
|
|
value = value.combine(entryList.get(i).getValue());
|
|
|
|
|
}
|