diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 9be7adb5..6b7ebae6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; +import us.codecraft.webmagic.utils.HttpConstant; /** * Remove duplicate urls and only push urls which are not duplicate.

@@ -31,7 +32,9 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
     @Override
     public void push(Request request, Task task) {
         logger.trace("get a candidate url {}", request.getUrl());
-        if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
+        // Check the request-only exemptions first so an exempt request short-circuits
+        // before the duplicate remover is consulted.
+        if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
             logger.debug("push to queue {}", request.getUrl());
             pushWhenNoDuplicate(request, task);
         }
@@ -41,6 +44,19 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
         return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
     }
 
+    /**
+     * Tells whether the request is exempt from duplicate removal.
+     * <p>
+     * Requests whose method is POST (compared case-insensitively) are exempt;
+     * a {@code null} method is treated as non-POST.
+     *
+     * @param request the candidate request
+     * @return {@code true} if the request method is POST
+     */
+    protected boolean noNeedToRemoveDuplicate(Request request) {
+        return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod());
+    }
+
     protected void pushWhenNoDuplicate(Request request, Task task) {
 
     }