From 5f1f4cbc4625c87ecbef5a60e957a05a7b4b7742 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 20:41:29 +0800 Subject: [PATCH] update comments --- .../main/java/us/codecraft/webmagic/Page.java | 66 ++++++-------- .../java/us/codecraft/webmagic/Request.java | 55 ++++-------- .../us/codecraft/webmagic/ResultItems.java | 23 +++-- .../main/java/us/codecraft/webmagic/Site.java | 86 ++++++++++--------- .../java/us/codecraft/webmagic/Spider.java | 50 ++++++----- .../downloader/HttpClientDownloader.java | 2 +- .../java/us/codecraft/webmagic/package.html | 3 - .../webmagic/utils}/Experimental.java | 2 +- .../us/codecraft/webmagic/MultiPageModel.java | 2 +- .../webmagic/downloader/FileCache.java | 2 +- .../us/codecraft/webmagic/model/HasKey.java | 2 +- .../model/annotation/ComboExtract.java | 7 +- .../webmagic/model/annotation/ExtractBy.java | 10 ++- .../webmagic/pipeline/MultiPagePipeline.java | 2 +- .../scheduler/FileCacheQueueScheduler.java | 2 +- 15 files changed, 152 insertions(+), 162 deletions(-) rename {webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation => webmagic-core/src/main/java/us/codecraft/webmagic/utils}/Experimental.java (71%) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index a894269b..2516dd10 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,30 +8,19 @@ import java.util.ArrayList; import java.util.List; /** - *
- * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
  *
- *     主要方法:
- *     {@link #getUrl()} 获取页面的Url
- *     {@link #getHtml()}  获取页面的html内容
- *     {@link #putField(String, Object)}  保存抽取的结果
- *     {@link #getResultItems()} 获取抽取的结果,在 {@link us.codecraft.webmagic.pipeline.Pipeline} 中调用
- *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
- *
- * 
- *
- * Store extracted result and urls to be crawled.
- *
- *     Main method:
- *     {@link #getUrl()} get url of current page
- *     {@link #getHtml()}  get content of current page
- *     {@link #putField(String, Object)}  save extracted result
- *     {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
- *
- * 
+ * Object storing extracted result and urls to be crawled.
+ * Main method:
+ * {@link #getUrl()} get url of current page
+ * {@link #getHtml()} get content of current page
+ * {@link #putField(String, Object)} save extracted result
+ * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
* * @author code4crafter@gmail.com
+ * @since 0.1.0 + * @see us.codecraft.webmagic.downloader.Downloader + * @see us.codecraft.webmagic.processor.PageProcessor */ public class Page { @@ -55,19 +44,19 @@ public class Page { } /** + * store extract results * - * - * @param key 结果的key - * @param field 结果的value + * @param key + * @param field */ public void putField(String key, Object field) { resultItems.put(key, field); } /** - * 获取页面的html内容 + * get html content of page * - * @return html 页面的html内容 + * @return html */ public Selectable getHtml() { return html; @@ -82,9 +71,9 @@ public class Page { } /** - * 添加待抓取的链接 + * add urls to crawl * - * @param requests 待抓取的链接 + * @param requests */ public void addTargetRequests(List requests) { synchronized (targetRequests) { @@ -99,9 +88,9 @@ public class Page { } /** - * 添加待抓取的链接 + * add url to crawl * - * @param requestString 待抓取的链接 + * @param requestString */ public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { @@ -114,9 +103,9 @@ public class Page { } /** - * 添加待抓取的页面,在需要传递附加信息时使用 + * add requests to crawl * - * @param request 待抓取的页面 + * @param request */ public void addTargetRequest(Request request) { synchronized (targetRequests) { @@ -125,27 +114,22 @@ public class Page { } /** - * 获取页面的Url + * get url of current page * - * @return url 当前页面的url,可用于抽取 + * @return url of current page */ public Selectable getUrl() { return url; } - /** - * 设置url - * - * @param url - */ public void setUrl(Selectable url) { this.url = url; } /** - * 获取抓取请求 + * get request of current page * - * @return request 抓取请求 + * @return request */ public Request getRequest() { return request; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index b9b8ddf6..fd7f60c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,33 +1,17 @@ 
package us.codecraft.webmagic; +import us.codecraft.webmagic.utils.Experimental; + import java.io.Serializable; import java.util.HashMap; import java.util.Map; /** - *
- * Request对象封装了待抓取的url信息。
- * 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
- *
- * Request对象包含一个extra属性,可以写入一些必须的上下文,这个特性在某些场合会有用。
- *
- *      Example:
- *          抓取${linktext}时,希望提取链接link,并保存linktext的信息。
- *      在上一个页面:
- *      public void process(Page page){
- *          Request request = new Request(link,linktext);
- *          page.addTargetRequest(request)
- *      }
- *      在下一个页面:
- *      public void process(Page page){
- *          String linktext =  (String)page.getRequest().getExtra()[0];
- *      }
- * 
- *
+ * Object contains url to crawl.
+ * It contains some additional information.
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午11:37 + * @since 0.1.0 */ public class Request implements Serializable { @@ -36,20 +20,22 @@ public class Request implements Serializable { private String url; /** - * 额外参数,可以保存一些需要的上下文信息 + * Store additional information in extras. */ private Map extras; + /** + * Priority of the request.
+ * The bigger the priority is, the earlier the request will be processed.<br>
+ * Need a scheduler supporting priority.
+ * But no scheduler in webmagic supporting priority now (: + */ + @Experimental private double priority; public Request() { } - /** - * 构建一个request对象 - * - * @param url 必须参数,待抓取的url - */ public Request(String url) { this.url = url; } @@ -59,12 +45,14 @@ public class Request implements Serializable { } /** - * 设置优先级,用于URL队列排序
- * 需扩展Scheduler
- * 目前还没有对应支持优先级的Scheduler实现 =。=
- * @param priority 优先级,越大则越靠前 + * Set the priority of request for sorting.
+ * Need a scheduler supporting priority.
+ * But no scheduler in webmagic supporting priority now (: + * + * @param priority * @return this */ + @Experimental public Request setPriority(double priority) { this.priority = priority; return this; @@ -85,11 +73,6 @@ public class Request implements Serializable { return this; } - /** - * 获取待抓取的url - * - * @return url 待抓取的url - */ public String getUrl() { return url; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 7a8e5c39..e0552709 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -4,10 +4,13 @@ import java.util.HashMap; import java.util.Map; /** - * 保存抽取结果的类,由PageProcessor处理得到,传递给{@link us.codecraft.webmagic.pipeline.Pipeline}进行持久化。
+ * Object contains extract results.
+ * It is contained in Page and will be processed in pipeline. + * * @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 下午12:20
+ * @since 0.1.0 + * @see Page + * @see us.codecraft.webmagic.pipeline.Pipeline */ public class ResultItems { @@ -25,7 +28,7 @@ public class ResultItems { return (T) fields.get(key); } - public Map getAll() { + public Map getAll() { return fields; } @@ -44,8 +47,10 @@ public class ResultItems { } /** - * 是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 - * @return 是否忽略 true 忽略 + * Whether to skip the result.
+ * Result which is skipped will not be processed by Pipeline. + * + * @return whether to skip the result */ public boolean isSkip() { return skip; @@ -53,8 +58,10 @@ public class ResultItems { /** - * 设置是否忽略这个页面,用于pipeline来判断是否对这个页面进行处理 - * @param skip + * Set whether to skip the result.
+ * Result which is skipped will not be processed by Pipeline. + * + * @param skip whether to skip the result * @return this */ public ResultItems setSkip(boolean skip) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 9ab97fe8..443f2bba 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; /** - * Site定义一个待抓取的站点的各种信息。
- * 这个类的所有getter方法,一般都只会被爬虫框架内部进行调用。
+ * Object contains setting for crawler.
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:13 + * @since 0.1.0 + * @see us.codecraft.webmagic.processor.PageProcessor */ public class Site { @@ -22,6 +21,9 @@ public class Site { private String charset; + /** + * startUrls is the urls the crawler to start with. + */ private List startUrls = new ArrayList(); private int sleepTime = 3000; @@ -37,19 +39,19 @@ public class Site { } /** - * 创建一个Site对象,等价于new Site() + * new a Site * - * @return 新建的对象 + * @return new site */ public static Site me() { return new Site(); } /** - * 为这个站点添加一个cookie,可用于抓取某些需要登录访问的站点。这个cookie的域名与{@link #getDomain()}是一致的 + * Add a cookie with domain {@link #getDomain()} * - * @param name cookie的名称 - * @param value cookie的值 + * @param name + * @param value * @return this */ public Site addCookie(String name, String value) { @@ -58,7 +60,7 @@ public class Site { } /** - * 为这个站点设置user-agent,很多网站都对user-agent进行了限制,不设置此选项可能会得到期望之外的结果。 + * set user agent * * @param userAgent userAgent * @return this @@ -69,27 +71,27 @@ public class Site { } /** - * 获取已经设置的所有cookie + * get cookies * - * @return 已经设置的所有cookie + * @return get cookies */ public Map getCookies() { return cookies; } /** - * 获取已设置的user-agent + * get user agent * - * @return 已设置的user-agent + * @return user agent */ public String getUserAgent() { return userAgent; } /** - * 获取已设置的domain + * get domain * - * @return 已设置的domain + * @return get domain */ public String getDomain() { if (domain == null) { @@ -101,10 +103,9 @@ public class Site { } /** - * 设置这个站点所在域名,必须项。
- * 目前不支持多个域名的抓取。抓取多个域名请新建一个Spider。 + * set the domain of site. * - * @param domain 爬虫会抓取的域名 + * @param domain * @return this */ public Site setDomain(String domain) { @@ -113,10 +114,10 @@ public class Site { } /** - * 设置页面编码,若不设置则自动根据Html meta信息获取。
- * 一般无需设置encoding,如果发现下载的结果是乱码,则可以设置此项。
+ * Set charset of page manually.
+ * When charset is not set or set to null, it can be auto detected by Http header. * - * @param charset 编码格式,主要是"utf-8"、"gbk"两种 + * @param charset * @return this */ public Site setCharset(String charset) { @@ -125,20 +126,21 @@ public class Site { } /** - * 获取已设置的编码 + * get charset set manually * - * @return 已设置的domain + * @return charset */ public String getCharset() { return charset; } /** - * 设置可接受的http状态码,仅当状态码在这个集合中时,才会读取页面内容。
- * 默认为200,正常情况下,无须设置此项。
- * 某些站点会错误的返回状态码,此时可以对这个选项进行设置。
+ * Set acceptStatCode.
+ * When status code of http response is in acceptStatCodes, it will be processed.
+ * {200} by default.
+ * It is not necessary to set it.<br>
* - * @param acceptStatCode 可接受的状态码 + * @param acceptStatCode * @return this */ public Site setAcceptStatCode(Set acceptStatCode) { @@ -147,27 +149,27 @@ public class Site { } /** - * 获取可接受的状态码 + * get acceptStatCode * - * @return 可接受的状态码 + * @return acceptStatCode */ public Set getAcceptStatCode() { return acceptStatCode; } /** - * 获取初始页面的地址列表 + * get start urls * - * @return 初始页面的地址列表 + * @return start urls */ public List getStartUrls() { return startUrls; } /** - * 增加初始页面的地址,可反复调用此方法增加多个初始地址。 + * Add a url to start url.
* - * @param startUrl 初始页面的地址 + * @param startUrl * @return this */ public Site addStartUrl(String startUrl) { @@ -176,9 +178,10 @@ public class Site { } /** - * 设置两次抓取之间的间隔,避免对目标站点压力过大(或者避免被防火墙屏蔽...)。 + * Set the interval between the processing of two pages.
+ * Time unit is milliseconds.<br>
* - * @param sleepTime 单位毫秒 + * @param sleepTime * @return this */ public Site setSleepTime(int sleepTime) { @@ -187,25 +190,26 @@ public class Site { } /** - * 获取两次抓取之间的间隔 + * Get the interval between the processing of two pages.
+ * Time unit is milliseconds.<br>
* - * @return 两次抓取之间的间隔,单位毫秒 + * @return the interval between the processing of two pages, */ public int getSleepTime() { return sleepTime; } /** - * 获取重新下载的次数,默认为0 + * Get retry times when download fail, 0 by default.
* - * @return 重新下载的次数 + * @return retry times when download fail */ public int getRetryTimes() { return retryTimes; } /** - * 设置获取重新下载的次数,默认为0 + * Set retry times when download fail, 0 by default.
* * @return this */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index e23a8e70..ade2194c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; /** - *
- * webmagic爬虫的入口类。
- *
- * 示例:
- * 定义一个最简单的爬虫:
- *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
- *
- * 使用FilePipeline保存结果到文件:
- *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *          .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
- *
- * 使用FileCacheQueueScheduler缓存URL,关闭爬虫后下次自动从停止的页面继续抓取:
- *      Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- *          .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * 
+ * Entrance of a crawler.
+ * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.
+ * Every module is a field of Spider.
+ * The modules are defined in interface.
+ * You can customize a spider with various implementations of them.
+ * Examples:
+ *
+ * A simple crawler:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ *
+ * Store results to files by FilePipeline:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
+ *
+ * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown.
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
+ * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午6:53 + * @see Downloader + * @see Scheduler + * @see PageProcessor + * @see Pipeline + * @since 0.1.0 */ public class Spider implements Runnable, Task { @@ -222,11 +227,12 @@ public class Spider implements Runnable, Task { /** * 用某些特定URL进行爬虫测试 + * * @param urls 要抓取的url */ - public void test(String... urls){ + public void test(String... urls) { checkComponent(); - if (urls.length>0){ + if (urls.length > 0) { for (String url : urls) { processRequest(new Request(url)); } @@ -241,7 +247,7 @@ public class Spider implements Runnable, Task { } pageProcessor.process(page); addRequest(page); - if (!page.getResultItems().isSkip()){ + if (!page.getResultItems().isSkip()) { for (Pipeline pipeline : pipelines) { pipeline.process(page.getResultItems(), this); } @@ -298,8 +304,8 @@ public class Spider implements Runnable, Task { return this; } - public Spider clearPipeline(){ - pipelines=new ArrayList(); + public Spider clearPipeline() { + pipelines = new ArrayList(); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d555c5ed..fd680219 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader { * 直接下载页面的简便方法 * * @param url - * @return + * @return html */ public Html download(String url) { Page page = download(new Request(url), null); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html index 05328dcb..491afd93 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -2,9 +2,6 @@
Main class "Spider" and models. -
-
-包括webmagic入口类Spider和一些数据传递的实体类。
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java similarity index 71% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java index f619d125..265f869f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Experimental.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.model.annotation; +package us.codecraft.webmagic.utils; /** * @author code4crafter@gmail.com
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java index 88caf3ec..9190495c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.model.annotation.Experimental; +import us.codecraft.webmagic.utils.Experimental; import java.util.Collection; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java index 163c75ba..154667c7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java @@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.*; -import us.codecraft.webmagic.model.annotation.Experimental; +import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java index 3a8e6e2a..e068d04c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java @@ -1,6 +1,6 @@ package us.codecraft.webmagic.model; -import us.codecraft.webmagic.model.annotation.Experimental; +import us.codecraft.webmagic.utils.Experimental; /** * Interface to be implemented by page mode.
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java index 02fa25b4..5268a254 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java @@ -21,7 +21,7 @@ public @interface ComboExtract { */ ExtractBy[] value(); - enum Op { + public static enum Op { /** * All extractors will be arranged as a pipeline.
* The next extractor uses the result of the previous as source. @@ -49,7 +49,10 @@ public @interface ComboExtract { */ boolean notNull() default false; - public enum Source { + /** + * types of source for extracting. + */ + public static enum Source { /** * extract from the content extracted by class extractor */ diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 9e0ea18e..4bbebf68 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -21,7 +21,10 @@ public @interface ExtractBy { */ String value(); - public enum Type {XPath, Regex, Css} + /** + * types of extractor expressions + */ + public static enum Type {XPath, Regex, Css} /** * Extractor type, support XPath, CSS Selector and regex. @@ -38,7 +41,10 @@ public @interface ExtractBy { */ boolean notNull() default false; - public enum Source { + /** + * types of source for extracting. 
+ */ + public static enum Source { /** * extract from the content extracted by class extractor */ diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java index 81c684b6..5806602c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/MultiPagePipeline.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.model.annotation.Experimental; +import us.codecraft.webmagic.utils.Experimental; import us.codecraft.webmagic.utils.DoubleKeyMap; import java.util.*; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 3f691cd2..b646b0f4 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; /** - * Store urls and cursor in files so that a Spider can resume the status when shutdown。
+ * Store urls and cursor in files so that a Spider can resume the status when shutdown.
* * @author code4crafter@gmail.com
* @since 0.2.0