diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index 567dcda3..5b63a4b4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,7 +6,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** - * Replace selector。
+ * Replace selector.
* * @author code4crafter@gmail.com
* @since 0.1.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index a2eab3d9..c0e428cb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,7 +6,7 @@ import java.util.ArrayList; import java.util.List; /** - * XPath selector based on HtmlCleaner。
+ * XPath selector based on HtmlCleaner.
* * @author code4crafter@gmail.com
* @since 0.1.0 diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index 7ac7aa06..d1cbc21e 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -28,607 +28,6 @@ public class UrlUtilsTest { fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); } - @Test - public void testFixRelativeHtml(){ - String html = "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "虎嗅网\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\t\t\t\n" + - "\t\n" + - "
\n" + - "

\"虎嗅网\"

\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "订阅虎嗅\n" + - "RSS\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "

震后48小时,互联网公司行动启示录

\n" + - "

在公益产品开发上,互联网合作开放共享一面应得体现,商业竞争一面则应被冲淡

\n" + - " \"震后48小时,互联网公司行动启示录\"\n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"李经纬逝世,围绕他展开的一个时代和三个男人\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"今日嗅评:一切都应在灾难面前握手言和\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"娜拉出走以后怎么办?读《从理想主义到经验主义》,向自由致敬\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"【每日移动观察】Kindle手机可期?\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"地震后,为什么手机不通微信通?\"/
\n" + - "
\n" + - "
    \n" + - "
  • 地震后,为什么手机不通微信通?

  • \n" + - "
  • 北京晨报 发表于 2013-04-22 07:23
  • \n" + - "
  • 微信的工作原理是分组交换的业务模式。它经过压缩处理,占用的通道可宽可窄,信息可以一站站推送,有传输空间时再送出。在同等网络条件下,微信占用的网络资源要小得多
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"施密特:国家是具有垄断地位的服务提供商\"/
\n" + - "
\n" + - "
    \n" + - "
  • 施密特:国家是具有垄断地位的服务提供商

  • \n" + - "
  • Guardian 发表于 2013-04-22 15:51
  • \n" + - "
  • 国家提供了统一的规则。国家会制定实体政策和虚拟政策,这种二元性——网络空间实施一种战略,实体空间又部署另外一种战略——是可能的。
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"库克位子成疑。这就是华尔街\"/
\n" + - "
\n" + - "
    \n" + - "
  • 库克位子成疑。这就是华尔街

  • \n" + - "
  • Hotashang 发表于 2013-04-22 14:23
  • \n" + - "
  • 华尔街就是这样的一个地方,在长期投资预期与短视利润之间纠结。而作为公司的CEO,也会经常在这种长期投资预期与短视利润之间摇摆
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"跨国公司全球声誉排名:苹果跌破前十,宝马高居榜首\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"我为什么说生鲜电商是个伪命题?\"/
\n" + - "
\n" + - "
    \n" + - "
  • 我为什么说生鲜电商是个伪命题?

  • \n" + - "
  • 独自等待 发表于 2013-04-22 11:58
  • \n" + - "
  • 我也一直很期待这个领域能有一个大师兄,既牵着马,又挑着担,还能吓退妖怪,完成阿什顿伊顿般十项全能然后华丽转身,但,仅仅也就是期待罢了
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"怎么估算雅安芦山县地震的经济影响?422.6亿元\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"8个月,2500亿美元,蒂姆・库克一露面股票就跌\"/
\n" + - "
\n" + - "
    \n" + - "
  • 8个月,2500亿美元,蒂姆・库克一露面股票就跌

  • \n" + - "
  • 虎嗅 发表于 2013-04-22 11:29
  • \n" + - "
  • 苹果是散户投资者最喜欢的个股,分析师指出苹果的增长陷入停顿才是大问题。市场在用自己的方式和蒂姆・库克进行沟通,每次他公开发表观点,股价就下跌,无一幸免。
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"为什么19楼你学不会?\"/
\n" + - "
\n" + - "
    \n" + - "
  • 为什么19楼你学不会?

  • \n" + - "
  • 周宁 发表于 2013-04-22 10:06
  • \n" + - "
  • ①一句报网互动不能解决问题;②产品必须要有自身特色;③不要随意去学别人的定位;④19楼的盈利模式不好学;⑤19楼的团队你学不会!
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"iCar?很难指望了!可是苹果对汽车仍有野望\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"网络时代,重大突发新闻报道为何一错再错?\"/
\n" + - "
\n" + - "
    \n" + - "
  • 网络时代,重大突发新闻报道为何一错再错?

  • \n" + - "
  • QUARTZ 发表于 2013-04-22 09:39
  • \n" + - "
  • 抢先发布,而不关心是否属实,媒体啊、媒体!在一场公众眼球的“盛宴”中,对那些真正重要的问题选择集体漠视。争夺注意力有那么重要吗?有吗?
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"亚马逊首位数据挖掘负责人往事:开发出亚马逊最赚钱项目\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"马化腾与马云为何将数百万捐款,都放入壹基金参与雅安救助?\"/
\n" + - "
\n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
\"这16人从科技出发,在时代中留下烙印\"/
\n" + - "
\n" + - "
    \n" + - "
  • 这16人从科技出发,在时代中留下烙印

  • \n" + - "
  • TIME.com 发表于 2013-04-20 07:27
  • \n" + - "
  • 这些人从科技出发,影响了“时代”,就像逝去的乔布斯曾说的,“在宇宙中留下一道烙印”。无一不具备冒险与探索精神。
  • \n" + - "
\n" + - " \n" + - "
\n" + - "
\n" + - "
\n" + - "
更多
\n" + - "
\n" + - "
\n" + - "
\n" + - "
\n" + - "
    \n" + - "
  • \n" + - "
    \n" + - "
    \n" + - "

    产品情感化设计的两个层面

    \n" + - "

    云瑞

    \n" + - " 2013-04-22\n" + - "
    \n" + - "
  • \n" + - "
  • 现在的网站文案已经越来越有人情味了。例如提示文案不是“你的账号密码错误”而是“密码不对哦”,文案中增加了语气词。文案内容的情感化也会增加用户的接受程度
  • \n" + - "
  • 评论(0) 产品 投稿
  • \n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
    \n" + - "
  • \n" + - "
    \n" + - "
    \n" + - "

    这一次救灾,互联网好样的!

    \n" + - "

    葛甲

    \n" + - " 2013-04-22\n" + - "
    \n" + - "
  • \n" + - "
  • 本次四川雅安地震之后,网上谣言少了,辟谣的多了;传谣的少了,不信谣的多了;阴谋论少了,正能量多了;博眼球的企业少了,做实事的企业多了
  • \n" + - "
  • 评论(4) 公益
  • \n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - " \n" + - "
\n" + - "
    \n" + - "
  • \n" + - "
    \n" + - "
    \n" + - "

    关于地震和救灾的常见误区

    \n" + - "

    左志坚

    \n" + - " 2013-04-21\n" + - "
    \n" + - "
  • \n" + - "
  • 这一次,又是举国沸腾,但一些认知误区仍然存在。我想就这五年来的观察,做一些简单的总结,希望对关心灾区的朋友有些帮助和启发
  • \n" + - "
  • 评论(8) 公益
  • \n" + - "
\n" + - "
\n" + - " \n" + - "
\n" + - "
\t
\n" + - "\t\t\n" + - "\t
\n" + - "\t
\n" + - "\t\t
\n" + - "\t\t\t
\"官方微信\"
\n" + - "\t\t\t
\n" + - "\t\t\t\t
    \n" + - "\t\t\t\t\t
  • 官方微信
  • \n" + - "\t\t\t\t\t
  • 微信扫描二维码,
    获得每日精选资讯
  • \n" + - "\t\t\t\t
\n" + - "\t\t\t
\n" + - "\t\t
\n" + - "\t\t
\n" + - "\t\t\t
    \n" + - "\t\t\t\t
  • 官方微博
  • \n" + - "\t\t\t\t
  • \n" + - "\t\t\t\t
  • \n" + - "\t\t\t
\n" + - "\t\t
\n" + - "\t
\n" + - "
\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - " \n" + - "
\n" + - "
\n" + - "\n" + - "
\n" + - "\n" + - "
\t
\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "

关于我们|加入我们|广告及服务|常见问题解答|提交建议\n" + - "\n" + - "

\n" + - "

Copyright © 虎嗅网\n" + - "( 京ICP备12013432 )

\n" + - "
\n" + - "
\n" + - "\n" + - " 
\n" + - "\n" + - "回顶部\n" + - "\n" + - "\t\t\t
\n" + - "\t\t\t\n" + - "\t\t\t\n" + - "\n"; - String newHtml = UrlUtils.fixAllRelativeHrefs(html, "http://www.huxiu.com/"); - Assert.assertTrue(html.contains(" + * Extract an object of more than one pages, such as news and articles.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java index 91c603f2..a7aafb36 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -3,7 +3,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Page; /** - * Interface to be implemented by page models that need to do something after fields are extracted。
+ * Interface to be implemented by page models that need to do something after fields are extracted.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index d7f2b870..3cee9adb 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** - * The spider for page model extractor。
+ * The spider for page model extractor.
* In webmagic, we call a POJO containing extract result as "page model".
* You can customize a crawler by write a page model with annotations.
* Such as: diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index 4bbebf68..4f66deb0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -5,7 +5,7 @@ import java.lang.annotation.Retention; import java.lang.annotation.Target; /** - * Define the extractor for field or class。
+ * Define the extractor for field or class.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index 3be53d36..228ec8c8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -14,7 +14,7 @@ import java.io.IOException; import java.io.PrintWriter; /** - * Store results objects (page models) to files in JSON format。
+ * Store results objects (page models) to files in JSON format.
* Use model.getKey() as file name if the model implements HasKey.
* Otherwise use SHA1 as file name. * diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index 03313a95..625313f0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -13,7 +13,7 @@ import java.io.IOException; import java.io.PrintWriter; /** - * Store results to files in JSON format。
+ * Store results to files in JSON format.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e0912dee..e1916279 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -9,7 +9,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** - * Use Redis as url scheduler for distributed crawlers。
+ * Use Redis as url scheduler for distributed crawlers.
* * @author code4crafter@gmail.com
* @since 0.2.0 diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java index ba763c08..4aa6e040 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/DoubleKeyMap.java @@ -92,7 +92,6 @@ public class DoubleKeyMap extends MultiKeyMapBase { return null; } V remove = get(key1).remove(key2); - // 如果上一级map为空,把它也回收掉 if (get(key1).size() == 0) { remove(key1); }