diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index a894269b..2516dd10 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List;
/**
- *
- * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
*
- * 主要方法:
- * {@link #getUrl()} 获取页面的Url
- * {@link #getHtml()} 获取页面的html内容
- * {@link #putField(String, Object)} 保存抽取的结果
- * {@link #getResultItems()} 获取抽取的结果,在 {@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
- *
- *
- *
- * Store extracted result and urls to be crawled.
- *
- * Main method:
- * {@link #getUrl()} get url of current page
- * {@link #getHtml()} get content of current page
- * {@link #putField(String, Object)} save extracted result
- * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
- *
- *
+ * Object storing the extracted results and the urls to be crawled.
+ * Main methods:
+ * {@link #getUrl()} get url of current page
+ * {@link #getHtml()} get html content of current page
+ * {@link #putField(String, Object)} save an extracted result
+ * {@link #getResultItems()} get extracted results, to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
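+ *
+ * Example (a sketch of a PageProcessor using Page; the xpath expression and the
+ * xpath()/links() selectors of Selectable are assumed for illustration):
+ *      public void process(Page page) {
+ *          page.putField("title", page.getHtml().xpath("//title/text()"));
+ *          page.addTargetRequests(page.getHtml().links().all());
+ *      }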
*
* @author code4crafter@gmail.com
+ * @since 0.1.0
+ * @see us.codecraft.webmagic.downloader.Downloader
+ * @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Page {
@@ -55,19 +44,19 @@ public class Page {
}
/**
+ * Store an extracted result.
*
- *
- * @param key 结果的key
- * @param field 结果的value
+ * @param key key of the result
+ * @param field value of the result
*/
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
- * 获取页面的html内容
+ * get html content of page
*
- * @return html 页面的html内容
+ * @return html content of the page
*/
public Selectable getHtml() {
return html;
@@ -82,9 +71,9 @@ public class Page {
}
/**
- * 添加待抓取的链接
+ * add urls to crawl
*
- * @param requests 待抓取的链接
+ * @param requests urls to crawl
*/
public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) {
@@ -99,9 +88,9 @@ public class Page {
}
/**
- * 添加待抓取的链接
+ * add url to crawl
*
- * @param requestString 待抓取的链接
+ * @param requestString url to crawl
*/
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
@@ -114,9 +103,9 @@ public class Page {
}
/**
- * 添加待抓取的页面,在需要传递附加信息时使用
+ * Add a request to crawl. Use this method when extra information needs to be passed along.
*
- * @param request 待抓取的页面
+ * @param request request containing the url to crawl
*/
public void addTargetRequest(Request request) {
synchronized (targetRequests) {
@@ -125,27 +114,22 @@ public class Page {
}
/**
- * 获取页面的Url
+ * get url of current page
*
- * @return url 当前页面的url,可用于抽取
+ * @return url of current page
*/
public Selectable getUrl() {
return url;
}
- /**
- * 设置url
- *
- * @param url
- */
public void setUrl(Selectable url) {
this.url = url;
}
/**
- * 获取抓取请求
+ * get request of current page
*
- * @return request 抓取请求
+ * @return request
*/
public Request getRequest() {
return request;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index b9b8ddf6..fd7f60c9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -1,33 +1,17 @@
package us.codecraft.webmagic;
+import us.codecraft.webmagic.utils.Experimental;
+
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
- *
- * Request对象封装了待抓取的url信息。
- * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
- *
- * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
- *
- * Example:
- * 抓取${linktext}时,希望提取链接link,并保存linktext的信息。
- * 在上一个页面:
- * public void process(Page page){
- * Request request = new Request(link,linktext);
- * page.addTargetRequest(request)
- * }
- * 在下一个页面:
- * public void process(Page page){
- * String linktext = (String)page.getRequest().getExtra()[0];
- * }
- *
- *
+ * Object containing the url to crawl.
+ * It can also carry extra context information in its extras.
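+ *
+ * Example (a sketch of passing context between pages via extras; the
+ * putExtra/getExtra accessors are assumed for illustration):
+ * In the previous page:
+ *      public void process(Page page) {
+ *          page.addTargetRequest(new Request(link).putExtra("linktext", linktext));
+ *      }
+ * In the next page:
+ *      public void process(Page page) {
+ *          String linktext = (String) page.getRequest().getExtra("linktext");
+ *      }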
*
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午11:37
+ * @since 0.1.0
*/
public class Request implements Serializable {
@@ -36,20 +20,22 @@ public class Request implements Serializable {
private String url;
/**
- * 额外参数,可以保存一些需要的上下文信息
+ * Store additional information in extras.
*/
private Map<String, Object> extras;
+ /**
+ * Priority of the request.
+ * The request with the larger priority will be processed earlier.
+ * A scheduler supporting priority is required,
+ * but no scheduler in webmagic supports priority yet (:
+ */
+ @Experimental
private double priority;
public Request() {
}
- /**
- * 构建一个request对象
- *
- * @param url 必须参数,待抓取的url
- */
public Request(String url) {
this.url = url;
}
@@ -59,12 +45,14 @@ public class Request implements Serializable {
}
/**
- * 设置优先级,用于URL队列排序
- * 需扩展Scheduler
- * 目前还没有对应支持优先级的Scheduler实现 =。=
- * @param priority 优先级,越大则越靠前
+ * Set the priority of the request for sorting.
+ * A scheduler supporting priority is required,
+ * but no scheduler in webmagic supports priority yet (:
+ *
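+ * e.g. page.addTargetRequest(new Request(url).setPriority(1000));
+ *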
+ * @param priority priority of the request; the larger, the earlier it is processed
* @return this
*/
+ @Experimental
public Request setPriority(double priority) {
this.priority = priority;
return this;
@@ -85,11 +73,6 @@ public class Request implements Serializable {
return this;
}
- /**
- * 获取待抓取的url
- *
- * @return url 待抓取的url
- */
public String getUrl() {
return url;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
index 7a8e5c39..e0552709 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
@@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map;
/**
- * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
+ * Object containing the extracted results.
+ * It is carried by Page and will be processed by Pipeline.
+ *
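+ * Example (a sketch of a custom Pipeline reading the results; the key "title" is illustrative):
+ *      public void process(ResultItems resultItems, Task task) {
+ *          Object title = resultItems.get("title");
+ *          System.out.println(title);
+ *      }
+ *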
* @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 下午12:20
+ * @since 0.1.0
+ * @see Page
+ * @see us.codecraft.webmagic.pipeline.Pipeline
*/
public class ResultItems {
@@ -25,7 +28,7 @@ public class ResultItems {
return (T) fields.get(key);
}
- public Map<String,Object> getAll() {
+ public Map<String, Object> getAll() {
return fields;
}
@@ -44,8 +47,10 @@ public class ResultItems {
}
/**
- * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
- * @return 是否忽略 true 忽略
+ * Whether to skip the result.
+ * A result which is skipped will not be processed by the Pipeline.
+ *
+ * @return whether to skip the result
*/
public boolean isSkip() {
return skip;
@@ -53,8 +58,10 @@ public class ResultItems {
/**
- * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理
- * @param skip
+ * Set whether to skip the result.
+ * A result which is skipped will not be processed by the Pipeline.
+ *
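+ * e.g. call page.getResultItems().setSkip(true) in a PageProcessor to discard the page.
+ *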
+ * @param skip whether to skip the result
* @return this
*/
public ResultItems setSkip(boolean skip) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 9ab97fe8..443f2bba 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
/**
- * Site定义一个待抓取的站点的各种信息。
- * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。
+ * Object containing the settings of a site to crawl.
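+ *
+ * Example (the values are illustrative):
+ *      Site site = Site.me().setDomain("my.oschina.net")
+ *              .addStartUrl("http://my.oschina.net/")
+ *              .setSleepTime(500);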
*
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午12:13
+ * @since 0.1.0
+ * @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Site {
@@ -22,6 +21,9 @@ public class Site {
private String charset;
+ /**
+ * startUrls are the urls the crawler starts with.
+ */
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
@@ -37,19 +39,19 @@ public class Site {
}
/**
- * 创建一个Site对象,等价于new Site()
+ * Create a new Site, equivalent to new Site().
*
- * @return 新建的对象
+ * @return new site
*/
public static Site me() {
return new Site();
}
/**
- * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的
+ * Add a cookie with the domain {@link #getDomain()}. Useful for sites that require login.
*
- * @param name cookie的名称
- * @param value cookie的值
+ * @param name name of the cookie
+ * @param value value of the cookie
* @return this
*/
public Site addCookie(String name, String value) {
@@ -58,7 +60,7 @@ public class Site {
}
/**
- * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。
+ * Set the user agent. Many sites restrict access by user agent; leaving it unset may lead to unexpected results.
*
* @param userAgent userAgent
* @return this
@@ -69,27 +71,27 @@ public class Site {
}
/**
- * 获取已经设置的所有cookie
+ * get cookies
*
- * @return 已经设置的所有cookie
+ * @return cookies
*/
public Map<String, String> getCookies() {
return cookies;
}
/**
- * 获取已设置的user-agent
+ * get user agent
*
- * @return 已设置的user-agent
+ * @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
- * 获取已设置的domain
+ * get domain
*
- * @return 已设置的domain
+ * @return domain
*/
public String getDomain() {
if (domain == null) {
@@ -101,10 +103,9 @@ public class Site {
}
/**
- * 设置这个站点所在域名,必须项。
- * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。
+ * Set the domain of the site. Required. To crawl multiple domains, create a new Spider for each.
*
- * @param domain 爬虫会抓取的域名
+ * @param domain domain of the site to crawl
* @return this
*/
public Site setDomain(String domain) {
@@ -113,10 +114,10 @@ public class Site {
}
/**
- * 设置页面编码,若不设置则自动根据Html meta信息获取。
- * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
+ * Set the charset of the page manually.
+ * When the charset is not set or set to null, it will be auto detected from the Http header.
*
- * @param charset 编码格式,主要是"utf-8"、"gbk"两种
+ * @param charset charset of the page, usually "utf-8" or "gbk"
* @return this
*/
public Site setCharset(String charset) {
@@ -125,20 +126,21 @@ public class Site {
}
/**
- * 获取已设置的编码
+ * get charset set manually
*
- * @return 已设置的domain
+ * @return charset
*/
public String getCharset() {
return charset;
}
/**
- * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。
- * 默认为200,正常情况下,无须设置此项。
- * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。
+ * Set the acceptable http status codes (acceptStatCode).
+ * A response is processed only when its status code is in acceptStatCodes.
+ * {200} by default.
+ * It usually does not need to be set; it helps when a site returns wrong status codes.
*
- * @param acceptStatCode 可接受的状态码
+ * @param acceptStatCode acceptable status codes
* @return this
*/
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
@@ -147,27 +149,27 @@ public class Site {
}
/**
- * 获取可接受的状态码
+ * get acceptStatCode
*
- * @return 可接受的状态码
+ * @return acceptStatCode
*/
public Set<Integer> getAcceptStatCode() {
return acceptStatCode;
}
/**
- * 获取初始页面的地址列表
+ * get start urls
*
- * @return 初始页面的地址列表
+ * @return start urls
*/
public List<String> getStartUrls() {
return startUrls;
}
/**
- * 增加初始页面的地址,可反复调用此方法增加多个初始地址。
+ * Add a start url. Call it multiple times to add several start urls.
*
- * @param startUrl 初始页面的地址
+ * @param startUrl start url to add
* @return this
*/
public Site addStartUrl(String startUrl) {
@@ -176,9 +178,10 @@ public class Site {
}
/**
- * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。
+ * Set the interval between the processing of two pages.
+ * Time unit is milliseconds.
+ * It helps to avoid putting too much pressure on the target site (or being blocked by a firewall).
*
- * @param sleepTime 单位毫秒
+ * @param sleepTime interval between the processing of two pages, in milliseconds
* @return this
*/
public Site setSleepTime(int sleepTime) {
@@ -187,25 +190,26 @@ public class Site {
}
/**
- * 获取两次抓取之间的间隔
+ * Get the interval between the processing of two pages.
+ * Time unit is milliseconds.
*
- * @return 两次抓取之间的间隔,单位毫秒
+ * @return the interval between the processing of two pages, in milliseconds
*/
public int getSleepTime() {
return sleepTime;
}
/**
- * 获取重新下载的次数,默认为0
+ * Get the retry times when a download fails, 0 by default.
*
- * @return 重新下载的次数
+ * @return retry times when a download fails
*/
public int getRetryTimes() {
return retryTimes;
}
/**
- * 设置获取重新下载的次数,默认为0
+ * Set the retry times when a download fails, 0 by default.
*
* @return this
*/
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index e23a8e70..ade2194c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
/**
- *
- * webmagic爬虫的入口类。
- *
- * 示例:
- * 定义一个最简单的爬虫:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
- *
- * 使用FilePipeline保存结果到文件:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
- *
- * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- *
+ * Entrance of a crawler.
+ * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.
+ * Every module is a field of Spider.
+ * The modules are defined as interfaces,
+ * so you can customize a spider with various implementations of them.
+ * Examples:
+ *
+ * A simple crawler:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ *
+ * Store results to files by FilePipeline:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
+ *
+ * Use FileCacheQueueScheduler to store urls and cursor in files, so that the Spider can resume from where it stopped after a shutdown.
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
*
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午6:53
+ * @see Downloader
+ * @see Scheduler
+ * @see PageProcessor
+ * @see Pipeline
+ * @since 0.1.0
*/
public class Spider implements Runnable, Task {
@@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
/**
- * 用某些特定URL进行爬虫测试
- * @param urls 要抓取的url
+ * Run the crawler on some specific urls, for testing.
+ *
+ * @param urls urls to crawl
*/
- public void test(String... urls){
+ public void test(String... urls) {
checkComponent();
- if (urls.length>0){
+ if (urls.length > 0) {
for (String url : urls) {
processRequest(new Request(url));
}
@@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
}
pageProcessor.process(page);
addRequest(page);
- if (!page.getResultItems().isSkip()){
+ if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
@@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
return this;
}
- public Spider clearPipeline(){
- pipelines=new ArrayList<Pipeline>();
+ public Spider clearPipeline() {
+ pipelines = new ArrayList<Pipeline>();
return this;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index d555c5ed..fd680219 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
- * 直接下载页面的简便方法
+ * A convenient method to download a page directly.
*
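+ * A sketch of usage (assumes the no-arg constructor):
+ *      Html html = new HttpClientDownloader().download("http://www.oschina.net");
+ *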
- * @param url
- * @return
+ * @param url url to download
+ * @return html
*/
public Html download(String url) {
Page page = download(new Request(url), null);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html
index 05328dcb..491afd93 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html
@@ -2,9 +2,6 @@
Main class "Spider" and models.
-
-
-包括webmagic入口类Spider和一些数据传递的实体类。