diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
index 567dcda3..5b63a4b4 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
@@ -6,7 +6,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
- * Replace selector。
+ * Replace selector.
*
* @author code4crafter@gmail.com
* @since 0.1.0
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
index a2eab3d9..c0e428cb 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
@@ -6,7 +6,7 @@ import java.util.ArrayList;
import java.util.List;
/**
- * XPath selector based on HtmlCleaner。
+ * XPath selector based on HtmlCleaner.
*
* @author code4crafter@gmail.com
* @since 0.1.0
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
index 7ac7aa06..d1cbc21e 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
@@ -28,607 +28,6 @@ public class UrlUtilsTest {
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
}
- @Test
- public void testFixRelativeHtml(){
- String html = "\n" +
- "\n" +
- "\n" +
- "
\n" +
- "\n" +
- "虎嗅网\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\n" +
- "\t\t\t\n" +
- "\t\n" +
- "\n" +
- "

\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
订阅虎嗅\n" +
- "\n" +
- "
\n" +
- "\n" +
- "
\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "\n" +
- "\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
在公益产品开发上,互联网合作开放共享一面应得体现,商业竞争一面则应被冲淡
\n" +
- "

\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-22 18:42
\n" +
- " - 时代的问题时代自会判定,但难心想象,李经纬的晚年会多么地心灰意冷,抑郁难拔
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-22 18:09
\n" +
- " - 互联网公司在灾难面前可以放下竞争和利益,协作应对灾难,是大家都愿意看到的事情,也很感激响应虎嗅的每一家公司。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-22 18:00
\n" +
- " - 这不是一本为发表所写的著作,而是作者应他兄弟的要求断断继继写下来的笔记。时间是从1973年到1974年作者逝世前为止。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-22 17:18
\n" +
- " - 这部传说中的Kindle手机,媒体与分析师们都为它设定好了特征:1、低价。2、良好的内容体验。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 北京晨报 发表于 2013-04-22 07:23
\n" +
- " - 微信的工作原理是分组交换的业务模式。它经过压缩处理,占用的通道可宽可窄,信息可以一站站推送,有传输空间时再送出。在同等网络条件下,微信占用的网络资源要小得多
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - Guardian 发表于 2013-04-22 15:51
\n" +
- " - 国家提供了统一的规则。国家会制定实体政策和虚拟政策,这种二元性——网络空间实施一种战略,实体空间又部署另外一种战略——是可能的。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - Hotashang 发表于 2013-04-22 14:23
\n" +
- " - 华尔街就是这样的一个地方,在长期投资预期与短视利润之间纠结。而作为公司的CEO,也会经常在这种长期投资预期与短视利润之间摇摆
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - Reputation Institute 发表于 2013-04-22 12:33
\n" +
- " - 宝马是唯一在所有七项评比标准中都进入前五名的公司。53%的首席高管认为公司声誉会提高销售和营收,63%的首席高管预期声誉管理在未来两到三年会成为公司的优先工作内容。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 独自等待 发表于 2013-04-22 11:58
\n" +
- " - 我也一直很期待这个领域能有一个大师兄,既牵着马,又挑着担,还能吓退妖怪,完成阿什顿伊顿般十项全能然后华丽转身,但,仅仅也就是期待罢了
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - FT中文网 发表于 2013-04-22 11:58
\n" +
- " - 参照汶川地震损失的5%计算,按照比较法的原则估计,雅安地震造成的直接经济损失大致为422.6亿,这也是国内近年来事件损失最大的一次自然灾害
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-22 11:29
\n" +
- " - 苹果是散户投资者最喜欢的个股,分析师指出苹果的增长陷入停顿才是大问题。市场在用自己的方式和蒂姆・库克进行沟通,每次他公开发表观点,股价就下跌,无一幸免。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 周宁 发表于 2013-04-22 10:06
\n" +
- " - ①一句报网互动不能解决问题;②产品必须要有自身特色;③不要随意去学别人的定位;④19楼的盈利模式不好学;⑤19楼的团队你学不会!
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - laptopmag.com 发表于 2013-04-22 09:43
\n" +
- " - iCar成为现实的希望越来越渺茫,后乔布斯时代的苹果公司正努力将Siri等技术融入现有的汽车产品中去
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - QUARTZ 发表于 2013-04-22 09:39
\n" +
- " - 抢先发布,而不关心是否属实,媒体啊、媒体!在一场公众眼球的“盛宴”中,对那些真正重要的问题选择集体漠视。争夺注意力有那么重要吗?有吗?
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - Business Insider 发表于 2013-04-22 07:46
\n" +
- " - 杰夫・贝索斯多次向这位软件工程师伸出橄榄枝,终于将其招至帐下,也成功研发出亚马逊有史以来最赚钱的项目
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - 虎嗅 发表于 2013-04-20 16:18
\n" +
- " - 在@中国红十字会总会的倡议微博下,却是数万声的“滚”
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- " - TIME.com 发表于 2013-04-20 07:27
\n" +
- " - 这些人从科技出发,影响了“时代”,就像逝去的乔布斯曾说的,“在宇宙中留下一道烙印”。无一不具备冒险与探索精神。
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
更多
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-22\n" +
- "
\n" +
- " \n" +
- " - 现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。文案内容的情感化也会增加用户的接受程度
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-22\n" +
- "
\n" +
- " \n" +
- " - 不做秀、不食言、不攀比,是总的原则;另外还有三个考虑:通过微博树立什么样的企业形象?怎么调动员工参与微博发布?怎么选择合作伙伴?
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-22\n" +
- "
\n" +
- " \n" +
- " - 在3.0时代,每个卖家必须更注重店铺的差异化、个性化,做有特色的店铺。在运营店铺的过程中,要带上感情、资讯、消费文化等附加价值的东西
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-22\n" +
- "
\n" +
- " \n" +
- " - 本次四川雅安地震之后,网上谣言少了,辟谣的多了;传谣的少了,不信谣的多了;阴谋论少了,正能量多了;博眼球的企业少了,做实事的企业多了
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-22\n" +
- "
\n" +
- " \n" +
- " - 淘宝无线加入“微淘”,与微信公众平台类似,卖家自营账号,通过账号运营转化订单;天猫无线采用全新订阅模式,由用户自主选择订阅账号
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-21\n" +
- "
\n" +
- " \n" +
- " - 寻人不是不好,但如果成了一种形式大于内容、资源不是最优化配置的方式,那还不如打破陈旧思维看看有没有更创新的方式
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-21\n" +
- "
\n" +
- " \n" +
- " - 这一次,又是举国沸腾,但一些认知误区仍然存在。我想就这五年来的观察,做一些简单的总结,希望对关心灾区的朋友有些帮助和启发
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " - \n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
2013-04-20\n" +
- "
\n" +
- " \n" +
- " - SNS和移动互联网是电子商务的两翼,在与电商的融合中二者才能找到赢利模式
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "
\t
\n" +
- "\t\t\n" +
- "\t
\n" +
- "\t
\n" +
- "\t\t
\n" +
- "\t\t\t
\n" +
- "\t\t\t
\n" +
- "\t\t\t\t
\n" +
- "\t\t\t\t\t- 官方微信
\n" +
- "\t\t\t\t\t- 微信扫描二维码,
获得每日精选资讯 \n" +
- "\t\t\t\t
\n" +
- "\t\t\t
\n" +
- "\t\t
\n" +
- "\t\t
\n" +
- "\t\t\t
\n" +
- "\t\t\t\t- 官方微博
\n" +
- "\t\t\t\t\n" +
- "\t\t\t\t\n" +
- "\t\t\t
\n" +
- "\t\t
\n" +
- "\t
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- "
\n" +
- " \n" +
- "
\n" +
- "
\n" +
- "\n" +
- "
\t
\n" +
- "\n" +
- "\n" +
- "\n" +
- "
\n" +
- "
\n" +
- "\n" +
- "

\n" +
- "\n" +
- "回顶部\n" +
- "\n" +
- "\t\t\t\n" +
- "\t\t\t\n" +
- "\t\t\t\n" +
- "\n";
- String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/");
- Assert.assertTrue(html.contains("
+ * Extract an object that spans more than one page, such as news and articles.
*
* @author code4crafter@gmail.com
* @since 0.2.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
index 91c603f2..a7aafb36 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java
@@ -3,7 +3,7 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Page;
/**
- * Interface to be implemented by page models that need to do something after fields are extracted。
+ * Interface to be implemented by page models that need to do something after fields are extracted.
*
* @author code4crafter@gmail.com
* @since 0.2.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
index d7f2b870..3cee9adb 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java
@@ -5,7 +5,7 @@ import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
- * The spider for page model extractor。
+ * The spider for page model extractor.
* In webmagic, we call a POJO containing extract result as "page model".
* You can customize a crawler by write a page model with annotations.
* Such as:
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
index 4bbebf68..4f66deb0 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@@ -5,7 +5,7 @@ import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
- * Define the extractor for field or class。
+ * Define the extractor for field or class.
*
* @author code4crafter@gmail.com
* @since 0.2.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
index 3be53d36..228ec8c8 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java
@@ -14,7 +14,7 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
- * Store results objects (page models) to files in JSON format。
+ * Store results objects (page models) to files in JSON format.
* Use model.getKey() as file name if the model implements HasKey.
* Otherwise use SHA1 as file name.
*
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
index 03313a95..625313f0 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java
@@ -13,7 +13,7 @@ import java.io.IOException;
import java.io.PrintWriter;
/**
- * Store results to files in JSON format。
+ * Store results to files in JSON format.
*
* @author code4crafter@gmail.com
* @since 0.2.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index e0912dee..e1916279 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -9,7 +9,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
/**
- * Use Redis as url scheduler for distributed crawlers。
+ * Use Redis as url scheduler for distributed crawlers.
*
* @author code4crafter@gmail.com
* @since 0.2.0
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
index ba763c08..4aa6e040 100755
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java
@@ -92,7 +92,6 @@ public class DoubleKeyMap extends MultiKeyMapBase {
return null;
}
V remove = get(key1).remove(key2);
- // 如果上一级map为空,把它也回收掉
if (get(key1).size() == 0) {
remove(key1);
}