From 5d5f3bf20e08e37b44b2807021b056cbc696a5f9 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 24 Oct 2022 00:17:21 +0800 Subject: [PATCH 1/9] Update for next development version --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 226c851d..25b0c3a8 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index fe1ff12c..d2cf2cd3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index 289d2759..cff8f74e 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index fc5d9b76..335d47df 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c726c07b..c216ac6e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 893fc0b7..f1951edf 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 0022a43b..b9e8e435 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent 
us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 36ded000..ff193caf 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.6 + 0.7.7-SNAPSHOT 4.0.0 From ac912e8f1fade5be3b0d8df521819f4b01ec6fba Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 12 Nov 2022 10:17:36 +0800 Subject: [PATCH 2/9] Revise QueueScheduler to support capacity-restricted. --- .../webmagic/scheduler/QueueScheduler.java | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index f9ad0e98..8ea3ab19 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -16,11 +16,30 @@ import java.util.concurrent.LinkedBlockingQueue; */ public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { - private BlockingQueue queue = new LinkedBlockingQueue(); + private final BlockingQueue queue; + + public QueueScheduler() { + this.queue = new LinkedBlockingQueue<>(); + } + + /** + * Creates a {@code QueueScheduler} with the given (fixed) capacity. 
+ * + * @param capacity the capacity of this queue, + * see {@link LinkedBlockingQueue#LinkedBlockingQueue(int)} + * @since 0.8.0 + */ + public QueueScheduler(int capacity) { + this.queue = new LinkedBlockingQueue<>(capacity); + } @Override public void pushWhenNoDuplicate(Request request, Task task) { - queue.add(request); + try { + queue.put(request); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } @Override From 075b98291bbc920fb3d49957778e633bb9a3d205 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sat, 12 Nov 2022 11:06:08 +0800 Subject: [PATCH 3/9] Return spider in setEmptySleepTime itself for chainning. --- .../src/main/java/us/codecraft/webmagic/Spider.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index fd35f772..9f9201ee 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -333,9 +333,10 @@ public class Spider implements Runnable, Task { } } else { // wait until new url added, - if (waitNewUrl()) - //if interrupted + if (waitNewUrl()) { + //if interrupted break; + } continue; } } @@ -805,11 +806,13 @@ public class Spider implements Runnable, Task { * Set wait time when no url is polled.

* * @param emptySleepTime In MILLISECONDS. + * @return this */ - public void setEmptySleepTime(long emptySleepTime) { + public Spider setEmptySleepTime(long emptySleepTime) { if(emptySleepTime<=0){ throw new IllegalArgumentException("emptySleepTime should be more than zero!"); } this.emptySleepTime = emptySleepTime; + return this; } } From 4915431845ac035bc5b9379c809edfb4a0f19603 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Tue, 15 Nov 2022 22:48:02 +0800 Subject: [PATCH 4/9] Revise logging level from warn to info, as we have passed the exception to onError. refs #1094 --- .../codecraft/webmagic/downloader/HttpClientDownloader.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index f138b200..72821f3c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -82,12 +82,16 @@ public class HttpClientDownloader extends AbstractDownloader { try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); + onSuccess(request, task); logger.info("downloading page success {}", request.getUrl()); + return page; } catch (IOException e) { - logger.warn("download page {} error", request.getUrl(), e); + onError(request, task, e); + logger.info("download page {} error", request.getUrl(), e); + return page; } finally { if (httpResponse != null) { From e735e4e585f965ba3dabbd2faae3ad6665a4681b Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Sun, 20 Nov 2022 18:31:36 +0800 Subject: [PATCH 5/9] Log the remaining capacity. 
--- .../us/codecraft/webmagic/scheduler/QueueScheduler.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 8ea3ab19..04d5b36b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,11 +1,10 @@ package us.codecraft.webmagic.scheduler; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; - import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
@@ -35,6 +34,8 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor @Override public void pushWhenNoDuplicate(Request request, Task task) { + logger.trace("Remaining capacity: {}", this.queue.remainingCapacity()); + try { queue.put(request); } catch (InterruptedException e) { From 64e6a9800a38ceb3e57f2f9f360b7212c2cc61c2 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:26:09 +0800 Subject: [PATCH 6/9] Add dead-lock note for QueueScheduler. --- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 04d5b36b..19d3bc73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -4,12 +4,17 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; /** * Basic Scheduler implementation.
* Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. * + * Note: if you use this {@link QueueScheduler} + * with {@link Site#getCycleRetryTimes()} enabled, you may encounter a dead-lock + * when the queue is full. + * * @author code4crafter@gmail.com
* @since 0.1.0 */ From 888682863c6bfd6a33b0314dcd9b672a50c80e2f Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:30:07 +0800 Subject: [PATCH 7/9] Update versions for release --- pom.xml | 2 +- webmagic-core/pom.xml | 2 +- webmagic-coverage/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- webmagic-samples/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 +- webmagic-scripts/pom.xml | 2 +- webmagic-selenium/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 25b0c3a8..215b483f 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 pom diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index d2cf2cd3..997eb812 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index cff8f74e..e2c0f741 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 335d47df..05d6100a 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index c216ac6e..449fcf24 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index f1951edf..b73f6fd2 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index b9e8e435..3ec15f9a 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index ff193caf..715d7731 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.7.7-SNAPSHOT + 0.8.0 4.0.0 From a7a06936f07152469daeaa85fd67a0b737231aa3 Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:33:30 +0800 Subject: [PATCH 8/9] Fix requireMavenVersion. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 215b483f..8f928be6 100644 --- a/pom.xml +++ b/pom.xml @@ -232,7 +232,7 @@ - 3.3.9 + 3.5.0 From 7d091def55709609c2894f619aaa8518a641769e Mon Sep 17 00:00:00 2001 From: Joe Zhou Date: Thu, 24 Nov 2022 00:42:41 +0800 Subject: [PATCH 9/9] Upgrade fastjson, jruby, slf4j. --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 8f928be6..68bf76d9 100644 --- a/pom.xml +++ b/pom.xml @@ -14,14 +14,14 @@ 4.4 2.11.0 3.12.0 - 2.0.14.graal + 2.0.19.graal 3.0.13 31.1-jre 2.26 4.5.13 4.4.15 3.7.1 - 9.3.8.0 + 9.3.9.0 2.7.0 4.13.2 2.7.3 @@ -31,7 +31,7 @@ 1.2.0 11.4 3.141.59 - 2.0.3 + 2.0.4 4.0.0.RELEASE 0.3.5