From 5215a492ccfe69bc83b6cba31f76211c5fef3dae Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 11:26:13 +0800 Subject: [PATCH] remove duplicate check for POST request #484 --- .../webmagic/scheduler/DuplicateRemovedScheduler.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java index 9be7adb5..6b7ebae6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java @@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover; +import us.codecraft.webmagic.utils.HttpConstant; /** * Remove duplicate urls and only push urls which are not duplicate.

@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { @Override public void push(Request request, Task task) { logger.trace("get a candidate url {}", request.getUrl()); - if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) { + if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request) || noNeedToRemoveDuplicate(request)) { logger.debug("push to queue {}", request.getUrl()); pushWhenNoDuplicate(request, task); } @@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler { return request.getExtra(Request.CYCLE_TRIED_TIMES) != null; } + protected boolean noNeedToRemoveDuplicate(Request request) { + return HttpConstant.Method.POST.equalsIgnoreCase(request.getMethod()); + } + protected void pushWhenNoDuplicate(Request request, Task task) { }