comments in english

pull/17/head
yihua.huang 12 years ago
parent e566a53936
commit 59aad6a7f4

@ -8,7 +8,7 @@ import java.util.Collection;
* Date: 13-8-4 <br> * Date: 13-8-4 <br>
* Time: 5:18 <br> * Time: 5:18 <br>
*/ */
public interface PagedModel { public interface MultiPageModel {
public String getPageKey(); public String getPageKey();
@ -16,6 +16,6 @@ public interface PagedModel {
public String getPage(); public String getPage();
public PagedModel combine(PagedModel pagedModel); public MultiPageModel combine(MultiPageModel multiPageModel);
} }

@ -17,7 +17,6 @@ import java.io.*;
/** /**
* Download a file and save it to a file for caching.<br> * Download a file and save it to a file for caching.<br>
* *
*
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* @since 0.2.1 * @since 0.2.1
*/ */

@ -0,0 +1,8 @@
package us.codecraft.webmagic.model.annotation;
/**
* Marker annotation for features that are not yet stable.<br>
* It declares no elements and carries no {@code @Retention} or {@code @Target}
* meta-annotation, so the language defaults apply (class-file retention,
* i.e. it is not visible through reflection at runtime).
*
* @author code4crafter@gmail.com
*/
public @interface Experimental {
}

@ -14,29 +14,24 @@ import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
/** /**
* JSON<br> * Store result objects (page models) to files in JSON format.<br>
* LANG=zh_CN.UTF-8<br> * Use the model's key() as the file name if the model implements HasKey.<br>
* Otherwise use the MD5 hex digest of the model's reflection string as the file name.
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.2.0
* Time: 6:28
*/ */
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
/** /**
* JsonFilePageModelPipeline使"/data/webmagic/" * new JsonFilePageModelPipeline with default path "/data/webmagic/"
*/ */
public JsonFilePageModelPipeline() { public JsonFilePageModelPipeline() {
setPath("/data/webmagic/"); setPath("/data/webmagic/");
} }
/**
* JsonFilePageModelPipeline
*
* @param path
*/
public JsonFilePageModelPipeline(String path) { public JsonFilePageModelPipeline(String path) {
setPath(path); setPath(path);
} }
@ -47,7 +42,7 @@ public class JsonFilePageModelPipeline extends FilePersistentBase implements Pag
try { try {
String filename; String filename;
if (o instanceof HasKey) { if (o instanceof HasKey) {
filename = path + ((HasKey)o).key() + ".json"; filename = path + ((HasKey) o).key() + ".json";
} else { } else {
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json"; filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
} }

@ -13,28 +13,22 @@ import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
/** /**
* JSON * Store results to files in JSON format<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.2.0
* Time: 6:28
*/ */
public class JsonFilePipeline extends FilePersistentBase implements Pipeline { public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());
/** /**
* JsonFilePipeline使"/data/webmagic/" * new JsonFilePipeline with default path "/data/webmagic"
*/ */
public JsonFilePipeline() { public JsonFilePipeline() {
setPath("/data/webmagic"); setPath("/data/webmagic");
} }
/**
* JsonFilePipeline
*
* @param path
*/
public JsonFilePipeline(String path) { public JsonFilePipeline(String path) {
setPath(path); setPath(path);
} }

@ -1,26 +1,28 @@
package us.codecraft.webmagic.pipeline; package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap; import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*; import java.util.*;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
/** /**
* Pipeline<br> * A pipeline that combines the results extracted from more than one page.<br>
* 使redis使<br> * Used for news and articles containing more than one web page. <br>
* MultiPagePipeline will store parts of object and output them when all parts are extracted.<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br> * @since 0.2.0
* Time: 5:15 <br>
*/ */
public class PagedPipeline implements Pipeline { @Experimental
public class MultiPagePipeline implements Pipeline {
private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class); private DoubleKeyMap<String, String, Boolean> pageMap = new DoubleKeyMap<String, String, Boolean>(ConcurrentHashMap.class);
private DoubleKeyMap<String, String, PagedModel> objectMap = new DoubleKeyMap<String, String, PagedModel>(ConcurrentHashMap.class); private DoubleKeyMap<String, String, MultiPageModel> objectMap = new DoubleKeyMap<String, String, MultiPageModel>(ConcurrentHashMap.class);
@Override @Override
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
@ -34,20 +36,20 @@ public class PagedPipeline implements Pipeline {
private void handleObject(Iterator<Map.Entry<String, Object>> iterator) { private void handleObject(Iterator<Map.Entry<String, Object>> iterator) {
Map.Entry<String, Object> objectEntry = iterator.next(); Map.Entry<String, Object> objectEntry = iterator.next();
Object o = objectEntry.getValue(); Object o = objectEntry.getValue();
if (o instanceof PagedModel) { if (o instanceof MultiPageModel) {
PagedModel pagedModel = (PagedModel) o; MultiPageModel multiPageModel = (MultiPageModel) o;
pageMap.put(pagedModel.getPageKey(), pagedModel.getPage(), Boolean.TRUE); pageMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), Boolean.TRUE);
if (pagedModel.getOtherPages() != null) { if (multiPageModel.getOtherPages() != null) {
for (String otherPage : pagedModel.getOtherPages()) { for (String otherPage : multiPageModel.getOtherPages()) {
Boolean aBoolean = pageMap.get(pagedModel.getPageKey(), otherPage); Boolean aBoolean = pageMap.get(multiPageModel.getPageKey(), otherPage);
if (aBoolean == null) { if (aBoolean == null) {
pageMap.put(pagedModel.getPageKey(), otherPage, Boolean.FALSE); pageMap.put(multiPageModel.getPageKey(), otherPage, Boolean.FALSE);
} }
} }
} }
//check if all pages are processed //check if all pages are processed
Map<String, Boolean> booleanMap = pageMap.get(pagedModel.getPageKey()); Map<String, Boolean> booleanMap = pageMap.get(multiPageModel.getPageKey());
objectMap.put(pagedModel.getPageKey(), pagedModel.getPage(), pagedModel); objectMap.put(multiPageModel.getPageKey(), multiPageModel.getPage(), multiPageModel);
if (booleanMap == null) { if (booleanMap == null) {
return; return;
} }
@ -57,12 +59,12 @@ public class PagedPipeline implements Pipeline {
return; return;
} }
} }
List<Map.Entry<String, PagedModel>> entryList = new ArrayList<Map.Entry<String, PagedModel>>(); List<Map.Entry<String, MultiPageModel>> entryList = new ArrayList<Map.Entry<String, MultiPageModel>>();
entryList.addAll(objectMap.get(pagedModel.getPageKey()).entrySet()); entryList.addAll(objectMap.get(multiPageModel.getPageKey()).entrySet());
if (entryList.size() != 0) { if (entryList.size() != 0) {
Collections.sort(entryList, new Comparator<Map.Entry<String, PagedModel>>() { Collections.sort(entryList, new Comparator<Map.Entry<String, MultiPageModel>>() {
@Override @Override
public int compare(Map.Entry<String, PagedModel> o1, Map.Entry<String, PagedModel> o2) { public int compare(Map.Entry<String, MultiPageModel> o1, Map.Entry<String, MultiPageModel> o2) {
try { try {
int i1 = Integer.parseInt(o1.getKey()); int i1 = Integer.parseInt(o1.getKey());
int i2 = Integer.parseInt(o2.getKey()); int i2 = Integer.parseInt(o2.getKey());
@ -72,7 +74,7 @@ public class PagedPipeline implements Pipeline {
} }
} }
}); });
PagedModel value = entryList.get(0).getValue(); MultiPageModel value = entryList.get(0).getValue();
for (int i = 1; i < entryList.size(); i++) { for (int i = 1; i < entryList.size(); i++) {
value = value.combine(entryList.get(i).getValue()); value = value.combine(entryList.get(i).getValue());
} }

@ -16,10 +16,10 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* url<br> * Store urls and the cursor in files so that a Spider can resume its state after a shutdown.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.2.0
* Time: 1:13
*/ */
public class FileCacheQueueScheduler implements Scheduler { public class FileCacheQueueScheduler implements Scheduler {
@ -46,8 +46,8 @@ public class FileCacheQueueScheduler implements Scheduler {
private Set<String> urls; private Set<String> urls;
public FileCacheQueueScheduler(String filePath) { public FileCacheQueueScheduler(String filePath) {
if (!filePath.endsWith("/")&&!filePath.endsWith("\\")){ if (!filePath.endsWith("/") && !filePath.endsWith("\\")) {
filePath+="/"; filePath += "/";
} }
this.filePath = filePath; this.filePath = filePath;
} }
@ -95,7 +95,7 @@ public class FileCacheQueueScheduler implements Scheduler {
readCursorFile(); readCursorFile();
readUrlFile(); readUrlFile();
} catch (IOException e) { } catch (IOException e) {
logger.error("init file error",e); logger.error("init file error", e);
} }
} }

@ -9,11 +9,10 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
/** /**
* 使redisurl<br> * Use Redis as url scheduler for distributed crawlers<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br> * @since 0.2.0
* Time: 7:07 <br>
*/ */
public class RedisScheduler implements Scheduler { public class RedisScheduler implements Scheduler {

@ -6,9 +6,8 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* JsonPath
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-12 <br>
* Time: 12:54 <br>
*/ */
public class JsonPathSelector implements Selector { public class JsonPathSelector implements Selector {

@ -4,7 +4,6 @@ import java.util.Map;
/** /**
* @author code4crafter@gmail.com * @author code4crafter@gmail.com
* Date Dec 14, 2012
*/ */
public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase { public class DoubleKeyMap<K1, K2, V> extends MultiKeyMapBase {
private Map<K1, Map<K2, V>> map; private Map<K1, Map<K2, V>> map;

@ -9,7 +9,7 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
/** /**
* multikey map, some basic objects * * multi-key map, some basic objects *
* *
* @author yihua.huang * @author yihua.huang
*/ */

@ -9,8 +9,6 @@ import us.codecraft.webmagic.Task;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 7:51 <br>
*/ */
public class RedisSchedulerTest { public class RedisSchedulerTest {

@ -7,8 +7,6 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-8-12 <br>
* Time: 1:12 <br>
*/ */
public class JsonPathSelectorTest { public class JsonPathSelectorTest {

@ -6,7 +6,7 @@ import us.codecraft.webmagic.model.samples.IteyeBlog;
import us.codecraft.webmagic.model.samples.News163; import us.codecraft.webmagic.model.samples.News163;
import us.codecraft.webmagic.model.samples.OschinaBlog; import us.codecraft.webmagic.model.samples.OschinaBlog;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline; import us.codecraft.webmagic.pipeline.MultiPagePipeline;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -40,7 +40,7 @@ public class QuickStarter {
key = readKey(key); key = readKey(key);
System.out.println("The demo started and will last 20 seconds..."); System.out.println("The demo started and will last 20 seconds...");
//Start spider //Start spider
OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).runAsync(); OOSpider.create(Site.me().addStartUrl(urlMap.get(key)), clazzMap.get(key)).pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).runAsync();
try { try {
Thread.sleep(20000); Thread.sleep(20000);

@ -1,6 +1,6 @@
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ComboExtract;
@ -8,7 +8,7 @@ import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.PagedPipeline; import us.codecraft.webmagic.pipeline.MultiPagePipeline;
import us.codecraft.webmagic.scheduler.RedisScheduler; import us.codecraft.webmagic.scheduler.RedisScheduler;
import java.util.Collection; import java.util.Collection;
@ -20,7 +20,7 @@ import java.util.List;
* Time: 8:17 <br> * Time: 8:17 <br>
*/ */
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements PagedModel { public class News163 implements MultiPageModel {
@ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html") @ExtractByUrl("http://news\\.163\\.com/\\d+/\\d+/\\d+/([^_]*).*\\.html")
private String pageKey; private String pageKey;
@ -58,10 +58,10 @@ public class News163 implements PagedModel {
} }
@Override @Override
public PagedModel combine(PagedModel pagedModel) { public MultiPageModel combine(MultiPageModel multiPageModel) {
News163 news163 = new News163(); News163 news163 = new News163();
news163.title = this.title; news163.title = this.title;
News163 pagedModel1 = (News163) pagedModel; News163 pagedModel1 = (News163) multiPageModel;
news163.content = this.content + pagedModel1.content; news163.content = this.content + pagedModel1.content;
return news163; return news163;
} }
@ -77,7 +77,7 @@ public class News163 implements PagedModel {
public static void main(String[] args) { public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class) OOSpider.create(Site.me().addStartUrl("http://news.163.com/13/0802/05/958I1E330001124J_2.html"), News163.class)
.scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new PagedPipeline()).pipeline(new ConsolePipeline()).run(); .scheduler(new RedisScheduler("localhost")).clearPipeline().pipeline(new MultiPagePipeline()).pipeline(new ConsolePipeline()).run();
} }
} }

@ -4,7 +4,7 @@
<date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated> <date-generated>Sat Aug 17 14:14:45 CST 2013</date-generated>
</meta> </meta>
<comment> <comment>
<key><![CDATA[us.codecraft.webmagic.PagedModel]]></key> <key><![CDATA[us.codecraft.webmagic.MultiPageModel]]></key>
<data><![CDATA[ 实现此接口以进行支持爬虫分页抓取。<br> <data><![CDATA[ 实现此接口以进行支持爬虫分页抓取。<br>
@author code4crafter@gmail.com <br> @author code4crafter@gmail.com <br>
Date: 13-8-4 <br> Date: 13-8-4 <br>

@ -4,7 +4,7 @@
<date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated> <date-generated>Sat Aug 17 14:14:46 CST 2013</date-generated>
</meta> </meta>
<comment> <comment>
<key><![CDATA[us.codecraft.webmagic.pipeline.PagedPipeline]]></key> <key><![CDATA[us.codecraft.webmagic.pipeline.MultiPagePipeline]]></key>
<data><![CDATA[ 用于实现分页的Pipeline。<br> <data><![CDATA[ 用于实现分页的Pipeline。<br>
在使用redis做分布式爬虫时请不要使用此功能。<br> 在使用redis做分布式爬虫时请不要使用此功能。<br>

Loading…
Cancel
Save