From 96454fd74c5eeb90fb227e5ac79b84c47a99fef1 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 24 Jul 2013 18:26:54 +0800 Subject: [PATCH] update java doc --- .../src/main/java/us/codecraft/webmagic/Page.java | 4 ++-- .../webmagic/downloader/HttpClientDownloader.java | 6 +++--- .../webmagic/pipeline/ConsolePipeline.java | 1 + .../codecraft/webmagic/pipeline/FilePipeline.java | 8 ++++++++ .../us/codecraft/webmagic/pipeline/Pipeline.java | 1 + .../webmagic/processor/PageProcessor.java | 6 ++++-- .../webmagic/processor/SimplePageProcessor.java | 2 ++ .../schedular/FileCacheQueueScheduler.java | 4 +++- .../webmagic/schedular/QueueScheduler.java | 1 + .../us/codecraft/webmagic/schedular/Scheduler.java | 12 ++++++++++++ .../us/codecraft/webmagic/schedular/package.html | 2 +- .../codecraft/webmagic/selector/CssSelector.java | 1 + .../codecraft/webmagic/selector/RegexResult.java | 1 + .../us/codecraft/webmagic/selector/Selectable.java | 2 +- .../codecraft/webmagic/selector/XpathSelector.java | 9 +++++---- .../java/us/codecraft/webmagic/utils/UrlUtils.java | 14 ++++++++++---- .../us/codecraft/webmagic/utils/UrlUtilsTest.java | 11 +++++------ .../codecraft/webmagic/FreemarkerPipelineTest.java | 2 +- 18 files changed, 62 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 5b1ceaf4..b2dd3db7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -101,7 +101,7 @@ public class Page { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { break; } - s = UrlUtils.fixRelativeUrl(s, url.toString()); + s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } @@ -116,7 +116,7 @@ public class Page { return; } synchronized (targetRequests) { - requestString = UrlUtils.fixRelativeUrl(requestString, url.toString()); + 
requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e4ae0ff6..ac3ea0fb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -58,7 +58,7 @@ public class HttpClientDownloader implements Downloader { //charset if (charset == null) { String value = httpResponse.getEntity().getContentType().getValue(); - charset = new PlainText(value).regex("charset=([^\\s]+)").toString(); + charset = UrlUtils.getCharset(value); } // handleGzip(httpResponse); @@ -82,8 +82,8 @@ public class HttpClientDownloader implements Downloader { Header ceheader = httpResponse.getEntity().getContentEncoding(); if (ceheader != null) { HeaderElement[] codecs = ceheader.getElements(); - for (int i = 0; i < codecs.length; i++) { - if (codecs[i].getName().equalsIgnoreCase("gzip")) { + for (HeaderElement codec : codecs) { + if (codec.getName().equalsIgnoreCase("gzip")) { httpResponse.setEntity( new GzipDecompressingEntity(httpResponse.getEntity())); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 6c2abba8..dff2ded7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -7,6 +7,7 @@ import us.codecraft.webmagic.selector.Selectable; import java.util.Map; /** + * 命令行输出抽取结果。可用于测试。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:45 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index b079dcc4..e48e2bb4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -11,6 +11,7 @@ import java.io.IOException; import java.io.PrintWriter; /** + * 持久化到文件的接口。 * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午6:28 @@ -21,10 +22,17 @@ public class FilePipeline implements Pipeline { private Logger logger = Logger.getLogger(getClass()); + /** + * 新建一个FilePipeline,使用默认保存路径"/data/temp/webmagic/" + */ public FilePipeline() { } + /** + * 新建一个FilePipeline + * @param path 文件保存路径 + */ public FilePipeline(String path) { this.path = path; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index 1be447c4..408392d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -4,6 +4,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Task; /** + * Pipeline是数据离线处理和持久化的接口。通过实现Pipeline以实现不同的持久化方式(例如保存到数据库)。 * @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java index c36ae980..3963d080 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java @@ -4,6 +4,8 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; /** + * 定制爬虫的核心接口。通过实现PageProcessor可以实现一个定制的爬虫。
+ * Implement this interface to build various spiders.
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午11:42 @@ -11,13 +13,13 @@ import us.codecraft.webmagic.Site; public interface PageProcessor { /** - * extends the class to implements variaty spiders + * 定义如何处理页面,包括链接提取、内容抽取等。 * @param page */ public void process(Page page); /** - * the site the processor for + * 定义任务一些配置信息,例如开始链接、抓取间隔、自定义cookie、自定义UA等。 * @return site */ public Site getSite(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 0d524462..47d3748b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -7,6 +7,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.List; /** + * 非常简单的抽取器。链接抽取使用定义的通配符,并保存抽取整个内容到content字段。
* @author code4crafter@gmail.com
* Date: 13-4-22 * Time: 下午9:15 @@ -22,6 +23,7 @@ public class SimplePageProcessor implements PageProcessor { public SimplePageProcessor(String startUrl, String urlPattern) { this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); + //compile "*" expression to regex this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java index 1f5298a0..77a6c0b3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueScheduler.java @@ -16,6 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; /** + * 磁盘文件实现的安全Scheduler,可以保证在长时间执行的任务中断后,下次启动从中断位置重新开始。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:13 @@ -91,6 +92,7 @@ public class FileCacheQueueScheduler implements Scheduler { readCursorFile(); readUrlFile(); } catch (IOException e) { + logger.error("init file error",e); } } @@ -109,7 +111,7 @@ public class FileCacheQueueScheduler implements Scheduler { private void readCursorFile() throws IOException { BufferedReader fileCursorReader = new BufferedReader(new FileReader(getFileName(fileCursor))); - String line = null; + String line; //read the last number while ((line = fileCursorReader.readLine()) != null) { cursor = new AtomicInteger(NumberUtils.toInt(line)); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java index 69768859..613e4062 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueScheduler.java @@ -10,6 +10,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; /** + * 内存队列实现的线程安全Scheduler。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:13 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java index 7e021327..bf440baf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Scheduler.java @@ -4,14 +4,26 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** + * 包含url管理和调度的接口。包括url抓取队列,url去重等功能。
+ * Scheduler的接口包含一个Task参数,该参数是为单Scheduler多Task预留的(Spider就是一个Task)。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 下午1:12 */ public interface Scheduler { + /** + * 加入一个待抓取的链接 + * @param request 待抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + */ public void push(Request request,Task task); + /** + * 返回下一个要抓取的链接 + * @param task 定义的任务,以满足单Scheduler多Task的情况 + * @return + */ public Request poll(Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html index 0e35610f..7887dd53 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/package.html @@ -1,5 +1,5 @@ -包含url管理和调度的接口Schedular及它的几个实现类。 +包含url管理和调度的接口Scheduler及它的几个实现类。 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index c2d654af..10dfb623 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -10,6 +10,7 @@ import java.util.ArrayList; import java.util.List; /** + * css风格的选择器。包装了Jsoup。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午9:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 9f4e2f06..04467bcc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; /** + * 封装正则表达式抽取接口的类。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午7:39 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index f4aa9a53..932115cd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,7 +18,7 @@ public interface Selectable { public Selectable xpath(String xpath); /** - * select list with jquery selector + * select list with css selector * * @param * @return diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index c2b408eb..02afe291 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,6 +6,7 @@ import java.util.ArrayList; import java.util.List; /** + * xpath的选择器。包装了HtmlCleaner。
* @author code4crafter@gmail.com
* Date: 13-4-21 * Time: 上午9:39 @@ -52,12 +53,12 @@ public class XpathSelector implements Selector { try { Object[] objects = tagNode.evaluateXPath(xpathStr); if (objects != null && objects.length >= 1) { - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof TagNode) { - TagNode tagNode1 = (TagNode) objects[i]; + for (Object object : objects) { + if (object instanceof TagNode) { + TagNode tagNode1 = (TagNode) object; results.add(htmlCleaner.getInnerHtml(tagNode1)); } else { - results.add(objects[i].toString()); + results.add(object.toString()); } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 667aaf25..0b7201d6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -14,7 +14,13 @@ public class UrlUtils { private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); - public static String fixRelativeUrl(String url, String refer) { + /** + * 将url想对地址转化为绝对地址 + * @param url url地址 + * @param refer url地址来自哪个页面 + * @return + */ + public static String canonicalizeUrl(String url, String refer) { if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { return url; } @@ -62,12 +68,12 @@ public class UrlUtils { private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); - public static String removeProtocal(String url) { + public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { - String domain = removeProtocal(url); + String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); @@ -84,7 +90,7 @@ public class UrlUtils { while (matcher.find()) { stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); 
stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"" + fixRelativeUrl(matcher.group(2), url) + "\""); + stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } stringBuilder.append(StringUtils.substring(html, lastEnd)); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 4cfdc046..cd55b2c7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -12,18 +12,18 @@ public class UrlUtilsTest { @Test public void testFixRelativeUrl() { - String fixrelativeurl = UrlUtils.fixRelativeUrl("aa", "http://www.dianping.com/sh/ss/com"); + String fixrelativeurl = UrlUtils.canonicalizeUrl("aa", "http://www.dianping.com/sh/ss/com"); System.out.println("fix: " + fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.fixRelativeUrl("..aa", "http://www.dianping.com/sh/ss/com"); + fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); 
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); // fixrelativeurl = fixrelativeurl("/aa", "http://www.dianping.com"); // System.out.println("fix: " + fixrelativeurl); @@ -628,7 +628,6 @@ public class UrlUtilsTest { "\t\t\t\n" + "\n"; String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); - String text = "订阅虎嗅"; Assert.assertTrue(html.contains("