From 1db940a08829a9f9c09282229883df434edc8494 Mon Sep 17 00:00:00 2001
From: zhugw <13656635451@qq.com>
Date: Thu, 11 Sep 2014 15:46:09 +0800
Subject: [PATCH] Update FileCacheQueueScheduler.java
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

在使用过程中发现urls.txt文件存在重复URL的情况,经跟踪源代码,发现初始化加载文件后,读取所有的url放入一集合中,但是之后添加待抓取URL时并未判断是否已存在该集合中(即文件中)了,故导致文件中重复URL的情况.故据此对源码做了修改,还请作者审阅.
---
 .../codecraft/webmagic/scheduler/FileCacheQueueScheduler.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
index 211b6989..2f49fb2b 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java
@@ -145,6 +145,8 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement
         if (!inited.get()) {
             init(task);
         }
+        if(urls.contains(request.getUrl())) //已存在此URL 表示已抓取过 跳过
+            return;
         queue.add(request);
         fileUrlWriter.println(request.getUrl());
     }