diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 211b6989..2f49fb2b 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -145,6 +145,8 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement if (!inited.get()) { init(task); } + if(urls.contains(request.getUrl())) //已存在此URL 表示已抓取过 跳过 + return; queue.add(request); fileUrlWriter.println(request.getUrl()); }