From 90bbe9b9514246215217eaee4c59bd7870bf716d Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 23:24:04 +0800 Subject: [PATCH] webmagic-core --- .../main/java/us/codecraft/webmagic/Page.java | 10 ++++----- .../webmagic/downloader/Downloader.java | 17 +++++++------- .../downloader/HttpClientDownloader.java | 11 +++++----- .../webmagic/downloader/HttpClientPool.java | 3 +-- .../webmagic/downloader/package.html | 2 +- .../webmagic/pipeline/ConsolePipeline.java | 6 ++--- .../webmagic/pipeline/FilePipeline.java | 16 +++++--------- .../codecraft/webmagic/pipeline/Pipeline.java | 17 ++++++++++---- .../codecraft/webmagic/pipeline/package.html | 2 +- .../webmagic/processor/PageProcessor.java | 22 ++++++++++++++----- .../processor/SimplePageProcessor.java | 16 +++++++------- .../codecraft/webmagic/processor/package.html | 2 +- .../webmagic/scheduler/QueueScheduler.java | 17 ++++++++------ .../webmagic/scheduler/Scheduler.java | 20 ++++++++++------- .../codecraft/webmagic/scheduler/package.html | 2 +- .../webmagic/selector/AndSelector.java | 2 ++ .../webmagic/selector/CssSelector.java | 6 ++--- .../webmagic/selector/OrSelector.java | 2 ++ .../codecraft/webmagic/selector/Selector.java | 4 ++++ zh_docs/us/codecraft/webmagic/Page-cmnt.xml | 2 +- 20 files changed, 104 insertions(+), 75 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 2516dd10..afdf2320 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -9,13 +9,13 @@ import java.util.List; /** * - * Object storing extracted result and urls to be crawled.
+ * Object storing extracted results and urls to fetch.
* Main method:
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
+ * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @since 0.1.0 @@ -71,7 +71,7 @@ public class Page { } /** - * add urls to crawl + * add urls to fetch * * @param requests */ @@ -88,7 +88,7 @@ public class Page { } /** - * add url to crawl + * add url to fetch * * @param requestString */ @@ -103,7 +103,7 @@ public class Page { } /** - * add requests to crawl + * add requests to fetch * * @param request */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 9a7f59a3..ec74950c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -5,16 +5,17 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** - * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
+ * Downloader is the part that downloads web pages and stores them in a Page object.
+ * Downloader has a {@link #setThread(int)} method because the downloader is usually the bottleneck of a crawler: + * downloaders commonly rely on mechanisms such as pooling, and the pool size is related to the number of threads. * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:14 + * @since 0.1.0 */ public interface Downloader { /** - * 下载页面,并保存信息到Page对象中。 + * Downloads web pages and store in Page object. * * @param request * @param task @@ -23,10 +24,8 @@ public interface Downloader { public Page download(Request request, Task task); /** - * 设置线程数,多线程程序一般需要Downloader支持
- * 如果不考虑多线程的可以不实现这个方法
- * - * @param thread 线程数量 + * Tell the downloader how many threads the spider used. + * @param threadNum number of threads */ - public void setThread(int thread); + public void setThread(int threadNum); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fd680219..75634104 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import org.apache.commons.io.IOUtils; import org.apache.http.Header; import org.apache.http.HeaderElement; import org.apache.http.HttpResponse; +import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.client.methods.HttpGet; @@ -22,12 +23,12 @@ import java.util.Set; /** - * 封装了HttpClient的下载器。已实现指定次数重试、处理gzip、自定义UA/cookie等功能。
+ * The http downloader based on HttpClient. * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:15 + * @since 0.1.0 */ +@ThreadSafe public class HttpClientDownloader implements Downloader { private Logger logger = Logger.getLogger(getClass()); @@ -35,14 +36,14 @@ public class HttpClientDownloader implements Downloader { private int poolSize = 1; /** - * 直接下载页面的简便方法 + * A simple method to download a url. * * @param url * @return html */ public Html download(String url) { Page page = download(new Request(url), null); - return (Html)page.getHtml(); + return (Html) page.getHtml(); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java index c6e26526..f2fffad2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientPool.java @@ -20,8 +20,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:29 + * @since 0.1.0 */ public class HttpClientPool { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html index cae5560e..719abd97 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html @@ -1,5 +1,5 @@ -包含了页面下载的接口Downloader和实现类HttpClientDownloader,该实现类封装了HttpComponent库。 +Downloader is the part that downloads web pages and store in Page object. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index e1648fe7..888e4550 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -6,11 +6,11 @@ import us.codecraft.webmagic.Task; import java.util.Map; /** - * 命令行输出抽取结果。可用于测试。
+ * Writes results to the console.
+ * Usually used for testing. * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:45 + * @since 0.1.0 */ public class ConsolePipeline implements Pipeline { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 9c88ba9e..04709f2d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.http.annotation.ThreadSafe; import org.apache.log4j.Logger; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; @@ -12,28 +13,23 @@ import java.io.PrintWriter; import java.util.Map; /** - * 持久化到文件的接口。 + * Store results in files.
* * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午6:28 + * @since 0.1.0 */ -public class FilePipeline extends FilePersistentBase implements Pipeline { +@ThreadSafe +public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + * create a FilePipeline with default path"/data/webmagic/" */ public FilePipeline() { setPath("/data/webmagic/"); } - /** - * 新建一个FilePipeline - * - * @param path 文件保存路径 - */ public FilePipeline(String path) { setPath(path); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 595a8e87..af2ed8cd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -4,12 +4,21 @@ import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; /** - * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。 + * Pipeline is the persistent and offline process part of crawler.
+ * The Pipeline interface can be implemented to customize how results are persisted. + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:39 + * @since 0.1.0 + * @see ConsolePipeline + * @see FilePipeline */ public interface Pipeline { - public void process(ResultItems resultItems,Task task); + /** + * Process extracted results. + * + * @param resultItems + * @param task + */ + public void process(ResultItems resultItems, Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html index 498183eb..6b0fcee2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html @@ -1,5 +1,5 @@ -包含了处理页面抽取结果的接口Pipeline和它的几个实现类。 +Pipeline is the persistent and offline process part of crawler. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index 3963d080..e0bb6237 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,23 +4,33 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; /** - * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。
- * extends the class to implements various spiders.
+ * Interface to be implemented to customize a crawler.
+ *
+ * In PageProcessor, you can customize: + *

+ * start urls and other settings in {@link Site}
+ * how the urls to fetch are detected
+ * how the data are extracted and stored
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午11:42 + * @see Site + * @see Page + * @since 0.1.0 */ public interface PageProcessor { /** - * 定义如何处理页面,包括链接提取、内容抽取等。 + * process the page, extract urls to fetch, extract the data and store + * * @param page */ public void process(Page page); /** - * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。 + * get the site settings + * * @return site + * @see Site */ public Site getSite(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 285a63d7..a0572a93 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,10 +7,10 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.List; /** - * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。
+ * A simple PageProcessor. + * * @author code4crafter@gmail.com
- * Date: 13-4-22 - * Time: 下午9:15 + * @since 0.1.0 */ public class SimplePageProcessor implements PageProcessor { @@ -22,25 +22,25 @@ public class SimplePageProcessor implements PageProcessor { this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)); //compile "*" expression to regex - this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; + this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"; } @Override public void process(Page page) { List requests = page.getHtml().links().regex(urlPattern).all(); - //调用page.addTargetRequests()方法添加待抓取链接 + //add urls to fetch page.addTargetRequests(requests); - //xpath方式抽取 + //extract by XPath page.putField("title", page.getHtml().xpath("//title")); - //sc表示使用Readability技术抽取正文 page.putField("html", page.getHtml().toString()); + //extract by Readability page.putField("content", page.getHtml().smartContent()); } @Override public Site getSite() { - //定义抽取站点的相关参数 + //settings return site; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html index 47274a1f..5ec7537b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html @@ -1,5 +1,5 @@ -包含了封装页面处理逻辑的接口PageProcessor和一个实现类SimplePageProcessor。实现PageProcessor即可定制一个自己的爬虫。 +PageProcessor custom part of a crawler for specific site. 
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 723b5f93..b263f91c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.scheduler; +import org.apache.http.annotation.ThreadSafe; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -10,11 +11,13 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; /** - * 内存队列实现的线程安全Scheduler。
+ * Basic Scheduler implementation.
+ * Stores urls to fetch in a LinkedBlockingQueue and removes duplicate urls with a HashSet. + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:13 + * @since 0.1.0 */ +@ThreadSafe public class QueueScheduler implements Scheduler { private Logger logger = Logger.getLogger(getClass()); @@ -24,11 +27,11 @@ public class QueueScheduler implements Scheduler { private Set urls = new HashSet(); @Override - public synchronized void push(Request request,Task task) { - if (logger.isDebugEnabled()){ - logger.debug("push to queue "+request.getUrl()); + public synchronized void push(Request request, Task task) { + if (logger.isDebugEnabled()) { + logger.debug("push to queue " + request.getUrl()); } - if (urls.add(request.getUrl())){ + if (urls.add(request.getUrl())) { queue.add(request); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java index fc39b450..5b82dc75 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java @@ -4,23 +4,27 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** - * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。
- * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。
+ * Scheduler is the part of url management.
+ * You can implement the Scheduler interface to: + * manage urls to fetch + * remove duplicate urls + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午1:12 + * @since 0.1.0 */ public interface Scheduler { /** - * 加入一个待抓取的链接 - * @param request 待抓取的链接 - * @param task 定义的任务,以满足单Scheduler多Task的情况 + * add a url to fetch + * + * @param request + * @param task */ - public void push(Request request,Task task); + public void push(Request request, Task task); /** * 返回下一个要抓取的链接 + * * @param task 定义的任务,以满足单Scheduler多Task的情况 * @return 下一个要抓取的链接 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html index 7887dd53..e67edcc6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html @@ -1,5 +1,5 @@ -包含url管理和调度的接口Scheduler及它的几个实现类。 +Scheduler is the part of url management. diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java index f13c6ed7..135442dc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AndSelector.java @@ -4,6 +4,8 @@ import java.util.ArrayList; import java.util.List; /** + * All selectors will be arranged as a pipeline.
+ * The next selector uses the result of the previous one as its source. * @author code4crafter@gmail.com
* @since 0.2.0 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 90a9d1d7..ab391863 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -10,10 +10,10 @@ import java.util.ArrayList; import java.util.List; /** - * css风格的选择器。包装了Jsoup。
+ * CSS selector. Based on Jsoup. + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * @since 0.1.0 */ public class CssSelector implements Selector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java index 4ece3222..fd16dcb0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -4,6 +4,8 @@ import java.util.ArrayList; import java.util.List; /** + * All extractors will do extracting separately,
+ * and the results of extractors will be combined as the final result. * @author code4crafter@gmail.com
* @since 0.2.0 */ diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 06756c44..3f4fe6bd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -4,13 +4,16 @@ import java.util.List; /** * Selector(extractor) for text.
+ * * @author code4crafter@gmail.com
+ * @since 0.1.0 */ public interface Selector { /** * Extract single result in text.
* If there are more than one result, only the first will be chosen. + * * @param text * @return result */ @@ -18,6 +21,7 @@ public interface Selector { /** * Extract all results in text.
+ * * @param text * @return results */ diff --git a/zh_docs/us/codecraft/webmagic/Page-cmnt.xml b/zh_docs/us/codecraft/webmagic/Page-cmnt.xml index 777f0b03..7ac44dc5 100644 --- a/zh_docs/us/codecraft/webmagic/Page-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/Page-cmnt.xml @@ -24,7 +24,7 @@ {@link #getHtml()} get content of current page {@link #putField(String, Object)} save extracted result {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline} - {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl + {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch