From a67f60b01b54f6789cba4d227d9292dda5b25ac0 Mon Sep 17 00:00:00 2001 From: Yao Date: Wed, 30 Aug 2017 18:42:00 +0800 Subject: [PATCH 01/44] fix the typo --- README-zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README-zh.md b/README-zh.md index cd1b090c..65d5d172 100644 --- a/README-zh.md +++ b/README-zh.md @@ -93,7 +93,7 @@ webmagic还包含两个可用的扩展包,因为这两个包都依赖了比较 PageProcessor是webmagic-core的一部分,定制一个PageProcessor即可实现自己的爬虫逻辑。以下是抓取osc博客的一段代码: ```java -public class OschinaBlogPageProcesser implements PageProcessor { +public class OschinaBlogPageProcessor implements PageProcessor { private Site site = Site.me().setDomain("my.oschina.net"); @@ -113,7 +113,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog") + Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog") .addPipeline(new ConsolePipeline()).run(); } } From b539522ca8431d804b95b2ced414e5b43415e9f6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:36:19 +0800 Subject: [PATCH 02/44] #701 support to tls1.2 --- pom.xml | 5 ++++ .../downloader/HttpClientGenerator.java | 5 +++- .../downloader/HttpClientDownloaderTest.java | 1 + .../downloader/SSLCompatibilityTest.java | 26 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java diff --git a/pom.xml b/pom.xml index 0765ae13..84ce1152 100644 --- a/pom.xml +++ b/pom.xml @@ -75,6 +75,11 @@ httpclient 4.5.2 + + org.apache.httpcomponents + httpcore + 4.5.2 + com.google.guava guava diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 562f36f6..28a16f41 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -9,6 +9,7 @@ import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; @@ -49,7 +50,9 @@ public class HttpClientGenerator { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书 + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + null, + new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 04a45a02..ece06000 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -322,4 +322,5 @@ public class HttpClientDownloaderTest { }); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java new file mode 100644 index 00000000..861b315a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/11/29 + * Time: 下午1:32 + */ +public class SSLCompatibilityTest { + + @Test + public void test_tls12() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Task task = Site.me().setCycleRetryTimes(5).toTask(); + Request request = new Request("https://juejin.im/"); + Page page = httpClientDownloader.download(request, task); + assertThat(page.isDownloadSuccess()).isTrue(); + } +} From e5db538c19188902592ea2f702e0860fc3eba600 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:49:40 +0800 Subject: [PATCH 03/44] #647 remove ThreadSafe annotation --- .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 2 -- .../main/java/us/codecraft/webmagic/pipeline/FilePipeline.java | 3 --- .../us/codecraft/webmagic/scheduler/PriorityScheduler.java | 2 -- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 2 -- .../us/codecraft/webmagic/downloader/PhantomJSDownloader.java | 2 -- 5 files changed, 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fff7c7cf..24889c88 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; -import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; @@ -30,7 +29,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 57d6eea3..be9fd7cc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,8 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; @@ -21,7 +19,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 8fa1b9ea..14cbaff3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.NumberUtils; @@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue; * @author code4crafter@gmail.com
* @since 0.2.1 */ -@ThreadSafe public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 078506c6..f9ad0e98 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 0fda351b..6055bdb0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -16,7 +15,6 @@ import java.io.*; * @author dolphineor@gmail.com * @version 0.5.3 */ -@ThreadSafe public class PhantomJSDownloader extends AbstractDownloader { private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); From 266083fa074819232a02d359566be81ff687da87 Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Wed, 29 Nov 2017 20:19:00 +0800 Subject: [PATCH 04/44] =?UTF-8?q?[Fix]=20#698=C2=A0=20Repair=20using=20red?= =?UTF-8?q?is,Request=20additional=20information=20is=20lost?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/scheduler/RedisScheduler.java | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index ce1111f2..1e94971f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.scheduler; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -60,7 +61,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (request.getExtras() != null) { + if (CheckForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -70,6 +71,33 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } } + private boolean CheckForAdditionalInfo(Request request) { + if (request == null) { + return false; + } + + if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { + return true; + } + + if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { + return true; + } + + if (request.isBinaryContent() || request.getRequestBody() != null) { + return true; + } + + if (request.getExtras() != null && !request.getExtras().isEmpty()) { + return true; + } + if (request.getPriority() != 0L) { + return true; + } + + return false; + } + @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); @@ -85,7 +113,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Request o = JSON.parseObject(new String(bytes), Request.class); return o; } - Request request = new Request(url); + Request request = new Request(url); return request; } finally { pool.returnResource(jedis); @@ -100,8 +128,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor return QUEUE_PREFIX + task.getUUID(); } - protected String getItemKey(Task task) - { + protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } From adf545483797392333135a3d7900b31b71110d9d Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Thu, 30 Nov 2017 11:35:12 +0800 Subject: [PATCH 05/44] =?UTF-8?q?[Fix]=20=E4=BF=AE=E6=AD=A3=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E6=96=B9=E6=B3=95=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f..ee04f35c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,7 +61,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -71,7 +71,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c701fe8d38c8060e97df3efab64ae4a0d94c0245 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 30 Nov 2017 11:50:52 +0800 Subject: [PATCH 06/44] #702 Refactor: rename CheckForAdditionalInfo to checkForAdditionalInfo --- .../us/codecraft/webmagic/scheduler/RedisScheduler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f..c70d8850 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,17 +61,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { - pool.returnResource(jedis); + jedis.close(); } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c7d1ed7d201515fbf479dcb62c612711af56070a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:50:49 +0800 Subject: [PATCH 07/44] #fix httpcore version: change to 4.4.4 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 84ce1152..2b2384fd 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ org.apache.httpcomponents httpcore - 4.5.2 + 4.4.4 com.google.guava From be892b80bf6682cd063d30ac25a79be0c079a901 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:57:06 +0800 Subject: [PATCH 08/44] update travis ci to openjdk --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a9f233f3..9e6f78d3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - oraclejdk7 + - openjdk7 From f1b3a29d6ff09efefef343c7f3d697e51a3eeea5 Mon Sep 17 00:00:00 2001 From: snyk-test Date: Fri, 28 Jun 2019 01:31:36 +0000 Subject: [PATCH 09/44] fix: webmagic-selenium/pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEHTTPCOMPONENTS-31517 --- webmagic-selenium/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 1cbf5921..e88cce5c 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -13,7 +13,7 @@ org.seleniumhq.selenium selenium-java - 2.41.0 + 3.0.0 us.codecraft From 2fd0e192fdef9316d93d101b4fc333ac9ae13fd2 Mon Sep 17 00:00:00 2001 From: Thomas Perkins Date: Mon, 29 Jul 2019 15:58:52 +0100 Subject: [PATCH 10/44] Add unit tests for us.codecraft.webmagic.utils.NumberUtils These tests were written using Diffblue Cover. --- .../webmagic/utils/NumberUtilsTest.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java new file mode 100644 index 00000000..f9e725e2 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/NumberUtilsTest.java @@ -0,0 +1,16 @@ +package us.codecraft.webmagic.utils; + +import org.junit.Assert; +import org.junit.Test; + +public class NumberUtilsTest { + + @Test + public void testCompareLong() { + Assert.assertEquals(0, NumberUtils.compareLong(0L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(9L, 0L)); + Assert.assertEquals(-1, NumberUtils.compareLong(0L, 9L)); + Assert.assertEquals(-1, NumberUtils.compareLong(-9L, 0L)); + Assert.assertEquals(1, NumberUtils.compareLong(0L, -9L)); + } +} From 0b8fab1bfa87429e2be6af6375e21be953bbd713 Mon Sep 17 00:00:00 2001 From: snyk-test Date: Mon, 2 Sep 2019 02:58:48 +0000 Subject: [PATCH 11/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-174736 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32043 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32044 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-32111 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-450207 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-450917 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-455617 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72445 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72446 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72447 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72882 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72883 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-72884 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-30433 - https://snyk.io/vuln/SNYK-JAVA-IONETTY-31515 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 7fd58a94a8c3f566a78afddc06124609e916b6a2 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 3 Oct 2019 02:59:35 +0000 Subject: [PATCH 12/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-469674 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-469676 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From fd6037af26c71a3b30d82382fcf680897ac6acc3 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 8 Oct 2019 02:58:45 +0000 Subject: [PATCH 13/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-471943 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 09fb39f431e84eb4f87aaaef7abe506659ed8231 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 15 Oct 2019 02:58:57 +0000 Subject: [PATCH 14/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-472980 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8815934f88a49d8a96b82b9e6918d0f02cf9c7ff Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 4 Jan 2020 02:58:56 +0000 Subject: [PATCH 15/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-540500 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8335d18c79decf58f41fe94b418dc720d9886f91 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 12 Feb 2020 02:58:51 +0000 Subject: [PATCH 16/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-548451 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From cc2d7af70a5a8adba06071494c1f9d87df7853ca Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Tue, 3 Mar 2020 02:59:56 +0000 Subject: [PATCH 17/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-559094 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From c79cd1dfa07304dee4ac299fc131c7c888024315 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sun, 22 Mar 2020 02:58:48 +0000 Subject: [PATCH 18/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560762 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560766 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 95da41f08b1c68e2c0ff04bd0d770b35db278e6c Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Sat, 28 Mar 2020 02:58:44 +0000 Subject: [PATCH 19/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560762 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-560766 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561362 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561373 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 38eedabef99b7c5e639f3fac6c408fa59ee8b091 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Wed, 1 Apr 2020 02:58:50 +0000 Subject: [PATCH 20/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561585 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561586 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-561587 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From 8327e482c9baa1038fcd89bcbe60d7ec625b7303 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 9 Apr 2020 02:58:45 +0000 Subject: [PATCH 21/44] fix: pom.xml to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-564887 - https://snyk.io/vuln/SNYK-JAVA-COMFASTERXMLJACKSONCORE-564888 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..a93ebb3f 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ com.github.dreamhead moco-core - 0.11.0 + 1.0.0 test From fe3d52e2a439d76025c26d258bd7f74472947548 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 11 Apr 2020 18:00:04 +0800 Subject: [PATCH 22/44] Add TLSv1.3 support. --- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 28a16f41..6409f568 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -50,7 +50,7 @@ public class HttpClientGenerator { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { From b98a87e45a2cc51f75a386f3939b01679a5fd347 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 11 Apr 2020 20:21:20 +0800 Subject: [PATCH 23/44] Serialize requests in FileCacheQueueScheduler, so that the extra info of request could be restored. --- .../scheduler/FileCacheQueueScheduler.java | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 6ca98285..37310e6f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.scheduler; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.math.NumberUtils; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -141,7 +143,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line)); + queue.add(deserializeRequest(line)); } } } finally { @@ -183,7 +185,7 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement init(task); } queue.add(request); - fileUrlWriter.println(request.getUrl()); + fileUrlWriter.println(serializeRequest(request)); } @Override @@ -204,4 +206,22 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement public int getTotalRequestsCount(Task task) { return getDuplicateRemover().getTotalRequestsCount(task); } + + protected String serializeRequest(Request request) { + String line = String.format("%1$s\t%2$s", request.getUrl(), + Base64.encodeBase64String(SerializationUtils.serialize(request))); + return line; + } + + protected Request deserializeRequest(String line) { + Request request; + String[] sections = line.split("\t"); + if (sections.length >= 2) { + request = (Request) SerializationUtils.deserialize(Base64.decodeBase64(sections[1])); + } else { + request = new Request(sections[0]); + } + return request; + } + } From c46400d126998dbe043d1495c839d52409941c94 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 12 Apr 2020 01:30:57 +0800 Subject: [PATCH 24/44] Fix javadoc of sleep time. Fixes #918. --- webmagic-core/src/main/java/us/codecraft/webmagic/Site.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index b6963ca4..72cc7d05 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -203,7 +203,7 @@ public class Site { /** * Set the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @param sleepTime sleepTime * @return this @@ -215,7 +215,7 @@ public class Site { /** * Get the interval between the processing of two pages.
- * Time unit is micro seconds.
+ * Time unit is milliseconds.
* * @return the interval between the processing of two pages, */ From e7476cb8dc67439159f7ffbf85d4e56f87810eea Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 19 Apr 2020 22:44:06 +0800 Subject: [PATCH 25/44] Make Request#getExtra be generic. --- .../src/main/java/us/codecraft/webmagic/Request.java | 7 ++++--- .../src/main/java/us/codecraft/webmagic/ResultItems.java | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index eefd91bb..5c26d20d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -78,14 +78,15 @@ public class Request implements Serializable { return this; } - public Object getExtra(String key) { + @SuppressWarnings("unchecked") + public T getExtra(String key) { if (extras == null) { return null; } - return extras.get(key); + return (T) extras.get(key); } - public Request putExtra(String key, Object value) { + public Request putExtra(String key, T value) { if (extras == null) { extras = new HashMap(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index 7b543613..488c81e7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; @@ -21,6 +20,7 @@ public class ResultItems { private boolean skip; + @SuppressWarnings("unchecked") public T get(String key) { Object o = fields.get(key); if (o == null) { From 30667f468705c61d78a91288046ca317dd9f94a8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 21 May 2020 18:51:37 +0800 Subject: [PATCH 26/44] Remove oss-parent setting as it is no longer active. --- pom.xml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pom.xml b/pom.xml index 2b2384fd..b4437708 100644 --- a/pom.xml +++ b/pom.xml @@ -1,10 +1,5 @@ - - org.sonatype.oss - oss-parent - 7 - us.codecraft 0.7.3 4.0.0 From ba1b4017a7399d7fa073b309d5376e5f3463214d Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 21 May 2020 19:59:29 +0800 Subject: [PATCH 27/44] Mark slf4j-log4j12 as optional. --- pom.xml | 2 +- webmagic-core/pom.xml | 1 + .../codecraft/webmagic/example/PatternProcessorExample.java | 6 ++++-- .../java/us/codecraft/webmagic/selector/Xpath2Selector.java | 5 +++-- webmagic-scripts/pom.xml | 4 ++++ .../webmagic/downloader/selenium/SeleniumDownloader.java | 6 ++++-- .../webmagic/downloader/selenium/WebDriverPool.java | 5 +++-- 7 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pom.xml b/pom.xml index b4437708..161d62f8 100644 --- a/pom.xml +++ b/pom.xml @@ -34,7 +34,7 @@ scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git webmagic-parent-0.6.1 - + Apache License, Version 2.0 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index e889cd49..66e455d3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -48,6 +48,7 @@ org.slf4j slf4j-log4j12 + true diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java index 8ecb08fe..9406abfd 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/PatternProcessorExample.java @@ -1,6 +1,8 @@ package us.codecraft.webmagic.example; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.*; import us.codecraft.webmagic.handler.CompositePageProcessor; import us.codecraft.webmagic.handler.CompositePipeline; @@ -15,7 +17,7 @@ import us.codecraft.webmagic.handler.RequestMatcher; */ public class PatternProcessorExample { - private static Logger log = Logger.getLogger(PatternProcessorExample.class); + private static Logger log = LoggerFactory.getLogger(PatternProcessorExample.class); public static void main(String... args) { diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 98b1efe4..d8aab6cc 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -2,11 +2,12 @@ package us.codecraft.webmagic.selector; import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; -import org.apache.log4j.Logger; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -40,7 +41,7 @@ public class Xpath2Selector implements Selector { private XPathExpression xPathExpression; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 22956cb5..9dbc7b39 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -49,6 +49,10 @@ webmagic-core ${project.version} + + org.slf4j + slf4j-log4j12 + us.codecraft webmagic-extension diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index f45f7e2a..cce293fc 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -1,10 +1,12 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -29,7 +31,7 @@ public class SeleniumDownloader implements Downloader, Closeable { private volatile WebDriverPool webDriverPool; - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private int sleepTime = 0; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index 1472cb32..e1d9dd03 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader.selenium; -import org.apache.log4j.Logger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.firefox.FirefoxDriver; @@ -8,6 +7,8 @@ import org.openqa.selenium.phantomjs.PhantomJSDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileReader; import java.io.IOException; @@ -27,7 +28,7 @@ import java.util.concurrent.atomic.AtomicInteger; * Time: 下午1:41
*/ class WebDriverPool { - private Logger logger = Logger.getLogger(getClass()); + private Logger logger = LoggerFactory.getLogger(getClass()); private final static int DEFAULT_CAPACITY = 5; From 436af973465797f3a2e865cce4bf6d1b0701362e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 13:59:05 +0800 Subject: [PATCH 28/44] Use spaces as indent. --- .../downloader/HttpClientGenerator.java | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 6409f568..9c389165 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -34,9 +34,9 @@ import java.util.Map; * @since 0.4.0 */ public class HttpClientGenerator { - + private transient Logger logger = LoggerFactory.getLogger(getClass()); - + private PoolingHttpClientConnectionManager connectionManager; public HttpClientGenerator() { @@ -48,43 +48,43 @@ public class HttpClientGenerator { connectionManager.setDefaultMaxPerRoute(100); } - private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { - try { + private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { + try { return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { + } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } - return SSLConnectionSocketFactory.getSocketFactory(); + return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { - // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 - X509TrustManager trustManager = new X509TrustManager() { - - @Override - public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - - }; - - SSLContext sc = SSLContext.getInstance("SSLv3"); - sc.init(null, new TrustManager[] { trustManager }, null); - return sc; + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + X509TrustManager trustManager = new X509TrustManager() { + + @Override + public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException { + } + + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + + }; + + SSLContext sc = SSLContext.getInstance("SSLv3"); + sc.init(null, new TrustManager[] { trustManager }, null); + return sc; } - + public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); return this; @@ -96,7 +96,7 @@ public class HttpClientGenerator { private CloseableHttpClient generateClient(Site site) { HttpClientBuilder httpClientBuilder = HttpClients.custom(); - + httpClientBuilder.setConnectionManager(connectionManager); if (site.getUserAgent() != null) { httpClientBuilder.setUserAgent(site.getUserAgent()); From 3e425231414d41bd2d7ce1500d3e36fb610754fb Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:14:16 +0800 Subject: [PATCH 29/44] TLSv1.3 requires Java 11 at least. Fixes #927. --- pom.xml | 2 +- .../downloader/HttpClientGenerator.java | 40 +++++++++++++------ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/pom.xml b/pom.xml index 161d62f8..eca25234 100644 --- a/pom.xml +++ b/pom.xml @@ -131,7 +131,7 @@ org.apache.commons commons-lang3 - 3.1 + 3.10 commons-collections diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 9c389165..d932de94 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,5 +1,18 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; + +import org.apache.commons.lang3.JavaVersion; +import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpRequestInterceptor; @@ -11,23 +24,18 @@ import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.*; +import org.apache.http.impl.client.BasicCookieStore; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; +import org.apache.http.impl.client.HttpClientBuilder; +import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Site; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; -import java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; +import us.codecraft.webmagic.Site; /** * @author code4crafter@gmail.com
@@ -50,7 +58,15 @@ public class HttpClientGenerator { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}, + SSLContext sslContext = createIgnoreVerifySSL(); + String[] supportedProtocols; + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + } else { + supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + } + logger.info("supportedProtocols: {}", String.join(", ", supportedProtocols)); + return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { From 3510e74d3f024a30aaf7355be7eb3b035c53fc3e Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:18:53 +0800 Subject: [PATCH 30/44] Travis supports openjdk relase number is 9 to 15 now. --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9e6f78d3..8f79da0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - openjdk7 + - openjdk9 From 4078766d0e0edd510ff5f7071772e9de96420ee3 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 14:30:14 +0800 Subject: [PATCH 31/44] Change log level of supportedProtocols. --- .../us/codecraft/webmagic/downloader/HttpClientGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index d932de94..ee94581a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -65,7 +65,7 @@ public class HttpClientGenerator { } else { supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; } - logger.info("supportedProtocols: {}", String.join(", ", supportedProtocols)); + logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 From ab5ac9d7969ec73354700670741e02098b23a597 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Thu, 28 May 2020 19:05:17 +0800 Subject: [PATCH 32/44] Fix test failure and javadoc failure. --- pom.xml | 19 +++++++++++-------- .../webmagic/selector/RegexSelector.java | 2 +- webmagic-scripts/pom.xml | 16 ---------------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/pom.xml b/pom.xml index eca25234..06a32130 100644 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,8 @@ UTF-8 UTF-8 + 1.8 4.0.0.RELEASE - webmagic-parent webmagic-parent @@ -103,7 +103,7 @@ com.alibaba fastjson - 1.2.28 + 1.2.56 com.github.dreamhead @@ -162,7 +162,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.18 + 3.0.0-M4 0 @@ -170,11 +170,10 @@ org.apache.maven.plugins maven-compiler-plugin - 3.1 + 3.7.0 - 1.6 - 1.6 - UTF-8 + ${java.version} + ${java.version} @@ -230,11 +229,15 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + 3.2.0 UTF-8 WebMagic 0.7.3 en_US + + + false + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 9ae538c0..fb0a161d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -41,7 +41,7 @@ public class RegexSelector implements Selector { /** * Create a RegexSelector. When there is no capture group, the value is set to 0 else set to 1. - * @param regexStr + * @param regexStr the regular expression. */ public RegexSelector(String regexStr) { this.compileRegex(regexStr); diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 9dbc7b39..94f08f02 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -7,7 +7,6 @@ 4.0.0 - us.codecraft webmagic-scripts 1.1.2-2 @@ -63,21 +62,6 @@ ${project.basedir}/src/main/java - - maven-compiler-plugin - - 1.6 - 1.6 - UTF-8 - - - - org.apache.maven.plugins - maven-resources-plugin - - UTF-8 - - org.apache.maven.plugins maven-jar-plugin From fe9dca12477ca86588de79522aa3ed1a9e26f9dd Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 14:28:34 +0800 Subject: [PATCH 33/44] Upgrade guava from 15.0 to 29.0. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index aefa8b07..d6e226ee 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ com.google.guava guava - 15.0 + 29.0-jre com.jayway.jsonpath From 98281ab26e67f4309a4a0b7b4b41ca1a66e74de7 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 14:43:59 +0800 Subject: [PATCH 34/44] Upgrade httpclient from 4.5.2 to 4.5.12. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d6e226ee..87c47ff5 100644 --- a/pom.xml +++ b/pom.xml @@ -68,7 +68,7 @@ org.apache.httpcomponents httpclient - 4.5.2 + 4.5.12 org.apache.httpcomponents From a7c4e701e417ba54d8df25912b48c69f290dfdab Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 18:12:05 +0800 Subject: [PATCH 35/44] Specify the required minimum maven version. --- pom.xml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pom.xml b/pom.xml index 87c47ff5..ee086c85 100644 --- a/pom.xml +++ b/pom.xml @@ -159,6 +159,26 @@ + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + enforce-maven + + enforce + + + + + 3.0.5 + + + + + + org.apache.maven.plugins maven-surefire-plugin From 6c05dd8b725982eb78d8f0042c2ca2aa04f70cf8 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Fri, 29 May 2020 18:19:45 +0800 Subject: [PATCH 36/44] Upgrade maven plugins. --- pom.xml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index ee086c85..2186b742 100644 --- a/pom.xml +++ b/pom.xml @@ -190,7 +190,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.7.0 + 3.8.1 ${java.version} ${java.version} @@ -219,10 +219,7 @@ org.apache.maven.plugins maven-resources-plugin - 2.6 - - UTF-8 - + 3.1.0 org.apache.maven.plugins @@ -236,7 +233,7 @@ org.apache.maven.plugins maven-source-plugin - 2.2.1 + 3.2.1 attach-sources @@ -278,7 +275,7 @@ org.apache.maven.plugins maven-release-plugin - 2.4.1 + 3.0.0-M1 @@ -333,7 +330,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6 + 1.6.8 true sonatype-nexus-staging From 71aa04c89f27f6a74ca981f19b3f9cc38a7c29d5 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 30 May 2020 02:10:01 +0800 Subject: [PATCH 37/44] Upgrade dependencies. --- pom.xml | 61 +++++++++++++++++++++++++++++++------- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 6 ++-- webmagic-samples/pom.xml | 4 +-- webmagic-saxon/pom.xml | 4 +-- webmagic-scripts/pom.xml | 12 +++----- webmagic-selenium/pom.xml | 7 +---- 7 files changed, 61 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 2186b742..1a5853ad 100644 --- a/pom.xml +++ b/pom.xml @@ -56,7 +56,7 @@ junit junit - 4.11 + 4.13 test @@ -73,7 +73,7 @@ org.apache.httpcomponents httpcore - 4.4.4 + 4.4.13 com.google.guava @@ -88,12 +88,12 @@ org.slf4j slf4j-api - 1.7.6 + 1.7.30 org.slf4j slf4j-log4j12 - 1.7.6 + 1.7.30 us.codecraft @@ -103,12 +103,12 @@ com.alibaba fastjson - 1.2.56 + 1.2.68 com.github.dreamhead moco-core - 1.0.0 + 1.1.0 test @@ -125,7 +125,7 @@ org.assertj assertj-core - 1.5.0 + 3.16.1 test @@ -143,16 +143,55 @@ commons-io 1.3.2 + + org.codehaus.groovy + groovy-all + 2.4.19 + + + org.jruby + jruby + 9.2.11.1 + org.jsoup jsoup 1.10.3 - org.mockito - mockito-all - 1.9.5 - test + org.python + jython + 2.7.2 + + + org.seleniumhq.selenium + selenium-java + 3.141.59 + + + net.sf.saxon + Saxon-HE + 10.1 + + + net.sourceforge.htmlcleaner + htmlcleaner + 2.5 + + + com.github.detro + phantomjsdriver + 1.2.0 + + + commons-cli + commons-cli + 1.4 + + + redis.clients + jedis + 2.9.3 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 66e455d3..4bc074da 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -83,4 +83,4 @@ -
\ No newline at end of file + diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 7e949ca6..bf7ff05d 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -13,16 +13,14 @@ redis.clients jedis - 2.9.0 com.google.guava guava - 15.0 true - us.codecraft + ${project.groupId} webmagic-core ${project.version} @@ -32,4 +30,4 @@
- \ No newline at end of file + diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 072bb3fd..44fee7c0 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -11,12 +11,12 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} - us.codecraft + ${project.groupId} webmagic-extension ${project.version} diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index 95f706ed..ae1454b5 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -11,19 +11,17 @@ - us.codecraft + ${project.groupId} webmagic-core ${project.version} net.sourceforge.htmlcleaner htmlcleaner - 2.5 net.sf.saxon Saxon-HE - 9.5.1-1 junit diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 94f08f02..9f4219d6 100755 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -16,27 +16,23 @@ org.jruby jruby - 1.7.6 org.jetbrains.kotlin kotlin-stdlib ${kotlin.version} - org.codehaus.groovy groovy-all - 2.1.6 - org.python + + org.python jython - 2.5.3 commons-cli commons-cli - 1.2 junit @@ -44,7 +40,7 @@ test - us.codecraft + ${project.groupId} webmagic-core ${project.version} @@ -53,7 +49,7 @@ slf4j-log4j12 - us.codecraft + ${project.groupId} webmagic-extension ${project.version} diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index e88cce5c..b5d09695 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -13,21 +13,16 @@ org.seleniumhq.selenium selenium-java - 3.0.0 - us.codecraft + ${project.groupId} webmagic-core ${project.version} com.github.detro phantomjsdriver - 1.2.0 - - - junit junit From 5d14efc50f5b81281819036c65bd7e81b04e10b0 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sun, 14 Jun 2020 00:20:39 +0800 Subject: [PATCH 38/44] Serialize request URL only in FileCacheQueueScheduler. --- .../scheduler/FileCacheQueueScheduler.java | 37 +++++++++---------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java index 37310e6f..fec3c1db 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/FileCacheQueueScheduler.java @@ -1,14 +1,13 @@ package us.codecraft.webmagic.scheduler; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.SerializationUtils; -import org.apache.commons.lang3.math.NumberUtils; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.scheduler.component.DuplicateRemover; - -import java.io.*; +import java.io.BufferedReader; +import java.io.Closeable; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.BlockingQueue; @@ -19,6 +18,13 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.scheduler.component.DuplicateRemover; + /** * Store urls and cursor in files so that a Spider can resume the status when shutdown.
@@ -208,20 +214,11 @@ public class FileCacheQueueScheduler extends DuplicateRemovedScheduler implement } protected String serializeRequest(Request request) { - String line = String.format("%1$s\t%2$s", request.getUrl(), - Base64.encodeBase64String(SerializationUtils.serialize(request))); - return line; + return request.getUrl(); } protected Request deserializeRequest(String line) { - Request request; - String[] sections = line.split("\t"); - if (sections.length >= 2) { - request = (Request) SerializationUtils.deserialize(Base64.decodeBase64(sections[1])); - } else { - request = new Request(sections[0]); - } - return request; + return new Request(line); } } From 2413366adb6df0f27b5f806f0228d5a41fb90935 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Mon, 15 Jun 2020 20:01:14 +0800 Subject: [PATCH 39/44] Format code, no actual code changed. --- .../us/codecraft/webmagic/proxy/Proxy.java | 136 +++++++++--------- .../webmagic/proxy/ProxyProvider.java | 2 +- .../webmagic/proxy/SimpleProxyProvider.java | 1 + 3 files changed, 70 insertions(+), 69 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index c5f10073..4b49557b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,73 +1,73 @@ package us.codecraft.webmagic.proxy; -/** - * - */ - public class Proxy { - private String host; - private int port; - private String username; - private String password; - - public Proxy(String host, int port) { - this.host = host; - this.port = port; - } - - public Proxy(String host, int port, String username, String password) { - this.host = host; - this.port = port; - this.username = username; - this.password = password; - } - - public String getHost() { - return host; - } - - public int getPort() { - return port; - } - - public String getUsername() { - return username; - } - - public String getPassword() { - return password; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Proxy proxy = (Proxy) o; - - if (port != proxy.port) return false; - if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; - if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; - return password != null ? password.equals(proxy.password) : proxy.password == null; - } - - @Override - public int hashCode() { - int result = host != null ? host.hashCode() : 0; - result = 31 * result + port; - result = 31 * result + (username != null ? username.hashCode() : 0); - result = 31 * result + (password != null ? password.hashCode() : 0); - return result; - } - - @Override - public String toString() { - return "Proxy{" + - "host='" + host + '\'' + - ", port=" + port + - ", username='" + username + '\'' + - ", password='" + password + '\'' + - '}'; - } + private String host; + + private int port; + + private String username; + + private String password; + + public Proxy(String host, int port) { + this.host = host; + this.port = port; + } + + public Proxy(String host, int port, String username, String password) { + this.host = host; + this.port = port; + this.username = username; + this.password = password; + } + + public String getHost() { + return host; + } + + public int getPort() { + return port; + } + + public String getUsername() { + return username; + } + + public String getPassword() { + return password; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Proxy proxy = (Proxy) o; + + if (port != proxy.port) return false; + if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; + return password != null ? password.equals(proxy.password) : proxy.password == null; + } + + @Override + public int hashCode() { + int result = host != null ? host.hashCode() : 0; + result = 31 * result + port; + result = 31 * result + (username != null ? username.hashCode() : 0); + result = 31 * result + (password != null ? password.hashCode() : 0); + return result; + } + + @Override + public String toString() { + return "Proxy{" + + "host='" + host + '\'' + + ", port=" + port + + ", username='" + username + '\'' + + ", password='" + password + '\'' + + '}'; + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 5b61a993..0cef4ed4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -25,5 +25,5 @@ public interface ProxyProvider { * @return proxy */ Proxy getProxy(Task task); - + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index d8f47fe4..ddef6a88 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -59,4 +59,5 @@ public class SimpleProxyProvider implements ProxyProvider { } return p % size; } + } From 791323a5b0730f483a5a488dff149995f6722c75 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Tue, 16 Jun 2020 14:45:29 +0800 Subject: [PATCH 40/44] Add Proxy#scheme. --- .../downloader/HttpUriRequestConverter.java | 2 +- .../us/codecraft/webmagic/proxy/Proxy.java | 27 ++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 28a7ce5e..4baaf4a4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -74,7 +74,7 @@ public class HttpUriRequestConverter { } if (proxy != null) { - requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); + requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme())); } requestBuilder.setConfig(requestConfigBuilder.build()); HttpUriRequest httpUriRequest = requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 4b49557b..fe3f78d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.proxy; public class Proxy { + private String scheme; + private String host; private int port; @@ -11,8 +13,13 @@ public class Proxy { private String password; public Proxy(String host, int port) { + this(host, port, null); + } + + public Proxy(String host, int port, String scheme) { this.host = host; this.port = port; + this.scheme = scheme; } public Proxy(String host, int port, String username, String password) { @@ -22,7 +29,15 @@ public class Proxy { this.password = password; } - public String getHost() { + public String getScheme() { + return scheme; + } + + public void setScheme(String scheme) { + this.scheme = scheme; + } + + public String getHost() { return host; } @@ -47,6 +62,7 @@ public class Proxy { if (port != proxy.port) return false; if (host != null ? !host.equals(proxy.host) : proxy.host != null) return false; + if (scheme != null ? !scheme.equals(proxy.scheme) : proxy.scheme != null) return false; if (username != null ? !username.equals(proxy.username) : proxy.username != null) return false; return password != null ? password.equals(proxy.password) : proxy.password == null; } @@ -55,6 +71,7 @@ public class Proxy { public int hashCode() { int result = host != null ? host.hashCode() : 0; result = 31 * result + port; + result = 31 * result + (scheme != null ? scheme.hashCode() : 0); result = 31 * result + (username != null ? username.hashCode() : 0); result = 31 * result + (password != null ? password.hashCode() : 0); return result; @@ -62,12 +79,8 @@ public class Proxy { @Override public String toString() { - return "Proxy{" + - "host='" + host + '\'' + - ", port=" + port + - ", username='" + username + '\'' + - ", password='" + password + '\'' + - '}'; + return String.format("Proxy{scheme='%1$s', host='%2$s', port=%3$d, username='%4$s', password='%5$s'}", + scheme, host, port, username, password); } } From 236e5ade44b24ac7db2e7821444db923a5f5da33 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 17 Jun 2020 11:19:37 +0800 Subject: [PATCH 41/44] Update Proxy#toString(). --- .../us/codecraft/webmagic/proxy/Proxy.java | 37 ++++++++++++++++++- .../codecraft/webmagic/proxy/ProxyTest.java | 16 +++++++- 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index fe3f78d9..179761cc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -1,5 +1,13 @@ package us.codecraft.webmagic.proxy; +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.lang3.StringUtils; + public class Proxy { private String scheme; @@ -53,6 +61,28 @@ public class Proxy { return password; } + public URI toURI() throws URISyntaxException { + final StringBuilder userInfoBuffer = new StringBuilder(); + if (username != null) { + userInfoBuffer.append(urlencode(username)); + } + if (password != null) { + userInfoBuffer.append(":").append(urlencode(password)); + } + final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); + final URI uri = new URI(scheme, userInfo, host, port, null, null, null); + return uri; + } + + private String urlencode(String s) { + String enc = StandardCharsets.UTF_8.name(); + try { + return URLEncoder.encode(s, enc); + } catch (UnsupportedEncodingException e) { + throw new IllegalArgumentException(e); + } + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -79,8 +109,11 @@ public class Proxy { @Override public String toString() { - return String.format("Proxy{scheme='%1$s', host='%2$s', port=%3$d, username='%4$s', password='%5$s'}", - scheme, host, port, username, password); + try { + return this.toURI().toString(); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e); + } } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 86af3672..89467013 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,11 +1,14 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.junit.BeforeClass; +import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.List; +import org.apache.http.HttpHost; +import org.junit.BeforeClass; +import org.junit.Test; + /** * @author yxssfxwzy@sina.com May 30, 2014 * @@ -42,4 +45,13 @@ public class ProxyTest { } } + @Test + public void testToString() { + assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); + assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); + assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); + assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); + assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); + } + } From 6d3f2d9b641b2c99f5b5b244d7ed86e4ee23ca13 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 24 Jun 2020 13:24:45 +0800 Subject: [PATCH 42/44] Wrap URISyntaxException as IllegalArgumentException for Proxy#toURI. --- .../java/us/codecraft/webmagic/proxy/Proxy.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 179761cc..dffadba8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -61,7 +61,7 @@ public class Proxy { return password; } - public URI toURI() throws URISyntaxException { + public URI toURI() { final StringBuilder userInfoBuffer = new StringBuilder(); if (username != null) { userInfoBuffer.append(urlencode(username)); @@ -70,7 +70,12 @@ public class Proxy { userInfoBuffer.append(":").append(urlencode(password)); } final String userInfo = StringUtils.defaultIfEmpty(userInfoBuffer.toString(), null); - final URI uri = new URI(scheme, userInfo, host, port, null, null, null); + URI uri; + try { + uri = new URI(scheme, userInfo, host, port, null, null, null); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e.getMessage(), e); + } return uri; } @@ -109,11 +114,7 @@ public class Proxy { @Override public String toString() { - try { - return this.toURI().toString(); - } catch (URISyntaxException e) { - throw new IllegalArgumentException(e); - } + return this.toURI().toString(); } } From 48bc73fbfff3c9bc38493ac262a4aa61720dcd80 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Wed, 24 Jun 2020 13:43:16 +0800 Subject: [PATCH 43/44] New method Proxy#create. --- .../us/codecraft/webmagic/proxy/Proxy.java | 15 +++ .../codecraft/webmagic/proxy/ProxyTest.java | 116 ++++++++++++------ 2 files changed, 93 insertions(+), 38 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index dffadba8..6554fab5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -20,6 +20,21 @@ public class Proxy { private String password; + public static Proxy create(final URI uri) { + Proxy proxy = new Proxy(uri.getHost(), uri.getPort(), uri.getScheme()); + String userInfo = uri.getUserInfo(); + if (userInfo != null) { + String[] up = userInfo.split(":"); + if (up.length == 1) { + proxy.username = up[0].isEmpty() ? null : up[0]; + } else { + proxy.username = up[0].isEmpty() ? null : up[0]; + proxy.password = up[1].isEmpty() ? null : up[1]; + } + } + return proxy; + } + public Proxy(String host, int port) { this(host, port, null); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 89467013..8e4c8202 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,7 +1,9 @@ package us.codecraft.webmagic.proxy; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import java.net.URI; import java.util.ArrayList; import java.util.List; @@ -15,43 +17,81 @@ import org.junit.Test; */ public class ProxyTest { - private static List httpProxyList = new ArrayList(); - - @BeforeClass - public static void before() { - // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", - // "0.0.0.4:0" }; - String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; - for (String line : source) { - httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); - } - } - - class Fetch extends Thread { - HttpHost hp; - - public Fetch(HttpHost hp) { - this.hp = hp; - } - - @Override - public void run() { - try { - System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); - sleep(500); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - - @Test - public void testToString() { - assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); - assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); - assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); - assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); - assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); - } + private static List httpProxyList = new ArrayList(); + + @BeforeClass + public static void before() { + // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", + // "0.0.0.4:0" }; + String[] source = { "::0.0.0.1:0", "::0.0.0.2:0", "::0.0.0.3:0", "::0.0.0.4:0" }; + for (String line : source) { + httpProxyList.add(new String[] {line.split(":")[0], line.split(":")[1], line.split(":")[2], line.split(":")[3] }); + } + } + + class Fetch extends Thread { + HttpHost hp; + + public Fetch(HttpHost hp) { + this.hp = hp; + } + + @Override + public void run() { + try { + System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); + sleep(500); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + @Test + public void testCreate() { + Proxy proxy = Proxy.create(URI.create("//127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("http://127.0.0.1:8080")); + assertEquals("http", proxy.getScheme()); + assertNull(proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//username@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertEquals("username", proxy.getUsername()); + assertNull(proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + + proxy = Proxy.create(URI.create("//:password@127.0.0.1:8080")); + assertNull(proxy.getScheme()); + assertNull(proxy.getUsername()); + assertEquals("password", proxy.getPassword()); + assertEquals("127.0.0.1", proxy.getHost()); + assertEquals(8080, proxy.getPort()); + } + + @Test + public void testToString() { + assertEquals("//127.0.0.1:8080", new Proxy("127.0.0.1", 8080).toString()); + assertEquals("http://127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "http").toString()); + assertEquals("//username:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", "password").toString()); + assertEquals("//username@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, "username", null).toString()); + assertEquals("//:password@127.0.0.1:8080", new Proxy("127.0.0.1", 8080, null, "password").toString()); + } } From 9aab25f339b5aae1cb87bed7e1c30fae3bb5aef8 Mon Sep 17 00:00:00 2001 From: leeyazhou Date: Fri, 7 Aug 2020 16:36:32 +0800 Subject: [PATCH 44/44] build: manage plugin version & remove build WARNING ## use the new dependency of commons-io [WARNING] The artifact org.apache.commons:commons-io:jar:1.3.2 has been relocated to commons-io:commons-io:jar:1.3.2 ## manage plugin version of maven-jar-plugin and maven-deploy-plugin [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-core:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-extension:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-scripts:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ line 61, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-selenium:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-deploy-plugin is missing. @ line 34, column 12 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-saxon:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-deploy-plugin is missing. @ line 34, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-samples:jar:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ us.codecraft:webmagic-parent:0.7.3, /opt/code/git/webmagic/pom.xml, line 263, column 21 [WARNING] [WARNING] Some problems were encountered while building the effective model for us.codecraft:webmagic-parent:pom:0.7.3 [WARNING] 'build.plugins.plugin.version' for org.apache.maven.plugins:maven-jar-plugin is missing. @ line 263, column 21 [WARNING] [WARNING] It is highly recommended to fix these problems because they threaten the stability of your build. [WARNING] [WARNING] For this reason, future Maven versions might no longer support building such malformed projects. --- pom.xml | 9 +++++---- webmagic-core/pom.xml | 2 +- webmagic-saxon/pom.xml | 2 ++ webmagic-selenium/pom.xml | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 1a5853ad..d016d0a9 100644 --- a/pom.xml +++ b/pom.xml @@ -139,10 +139,10 @@ 3.2.2
- org.apache.commons - commons-io - 1.3.2 - + commons-io + commons-io + 2.7 +
org.codehaus.groovy groovy-all @@ -263,6 +263,7 @@ org.apache.maven.plugins maven-jar-plugin + 3.2.0 log4j.xml diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 4bc074da..44fb7fa4 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -67,7 +67,7 @@ - org.apache.commons + commons-io commons-io diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index ae1454b5..da0c5f20 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -32,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index b5d09695..dfc4a195 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -32,7 +32,9 @@ + org.apache.maven.plugins maven-deploy-plugin + 3.0.0-M1 true