From b539522ca8431d804b95b2ced414e5b43415e9f6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:36:19 +0800 Subject: [PATCH 1/7] #701 support to tls1.2 --- pom.xml | 5 ++++ .../downloader/HttpClientGenerator.java | 5 +++- .../downloader/HttpClientDownloaderTest.java | 1 + .../downloader/SSLCompatibilityTest.java | 26 +++++++++++++++++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java diff --git a/pom.xml b/pom.xml index 0765ae13..84ce1152 100644 --- a/pom.xml +++ b/pom.xml @@ -75,6 +75,11 @@ httpclient 4.5.2 + + org.apache.httpcomponents + httpcore + 4.5.2 + com.google.guava guava diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 562f36f6..28a16f41 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -9,6 +9,7 @@ import org.apache.http.config.RegistryBuilder; import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; +import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; @@ -49,7 +50,9 @@ public class HttpClientGenerator { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { try { - return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书 + return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}, + null, + new DefaultHostnameVerifier()); // 优先绕过安全证书 } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 04a45a02..ece06000 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -322,4 +322,5 @@ public class HttpClientDownloaderTest { }); } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java new file mode 100644 index 00000000..861b315a --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/SSLCompatibilityTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/11/29 + * Time: 下午1:32 + */ +public class SSLCompatibilityTest { + + @Test + public void test_tls12() throws Exception { + HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Task task = Site.me().setCycleRetryTimes(5).toTask(); + Request request = new Request("https://juejin.im/"); + Page page = httpClientDownloader.download(request, task); + assertThat(page.isDownloadSuccess()).isTrue(); + } +} From e5db538c19188902592ea2f702e0860fc3eba600 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 29 Nov 2017 13:49:40 +0800 Subject: [PATCH 2/7] #647 remove ThreadSafe annotation --- .../us/codecraft/webmagic/downloader/HttpClientDownloader.java | 2 -- .../main/java/us/codecraft/webmagic/pipeline/FilePipeline.java | 3 --- .../us/codecraft/webmagic/scheduler/PriorityScheduler.java | 2 -- .../java/us/codecraft/webmagic/scheduler/QueueScheduler.java | 2 -- .../us/codecraft/webmagic/downloader/PhantomJSDownloader.java | 2 -- 5 files changed, 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fff7c7cf..24889c88 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; -import org.apache.http.annotation.ThreadSafe; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; @@ -30,7 +29,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class HttpClientDownloader extends AbstractDownloader { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 57d6eea3..be9fd7cc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -1,10 +1,8 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.FilePersistentBase; @@ -21,7 +19,6 @@ import java.util.Map; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = LoggerFactory.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java index 8fa1b9ea..14cbaff3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/PriorityScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.utils.NumberUtils; @@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue; * @author code4crafter@gmail.com
* @since 0.2.1 */ -@ThreadSafe public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { public static final int INITIAL_CAPACITY = 5; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java index 078506c6..f9ad0e98 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.scheduler; -import org.apache.http.annotation.ThreadSafe; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; @@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue; * @author code4crafter@gmail.com
* @since 0.1.0 */ -@ThreadSafe public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { private BlockingQueue queue = new LinkedBlockingQueue(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 0fda351b..6055bdb0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -1,6 +1,5 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.annotation.ThreadSafe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -16,7 +15,6 @@ import java.io.*; * @author dolphineor@gmail.com * @version 0.5.3 */ -@ThreadSafe public class PhantomJSDownloader extends AbstractDownloader { private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); From 266083fa074819232a02d359566be81ff687da87 Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Wed, 29 Nov 2017 20:19:00 +0800 Subject: [PATCH 3/7] =?UTF-8?q?[Fix]=20#698=C2=A0=20Repair=20using=20redis?= =?UTF-8?q?,Request=20additional=20information=20is=20lost?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../webmagic/scheduler/RedisScheduler.java | 35 ++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index ce1111f2..1e94971f 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.scheduler; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; @@ -60,7 +61,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (request.getExtras() != null) { + if (CheckForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -70,6 +71,33 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } } + private boolean CheckForAdditionalInfo(Request request) { + if (request == null) { + return false; + } + + if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { + return true; + } + + if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { + return true; + } + + if (request.isBinaryContent() || request.getRequestBody() != null) { + return true; + } + + if (request.getExtras() != null && !request.getExtras().isEmpty()) { + return true; + } + if (request.getPriority() != 0L) { + return true; + } + + return false; + } + @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); @@ -85,7 +113,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Request o = JSON.parseObject(new String(bytes), Request.class); return o; } - Request request = new Request(url); + Request request = new Request(url); return request; } finally { pool.returnResource(jedis); @@ -100,8 +128,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor return QUEUE_PREFIX + task.getUUID(); } - protected String getItemKey(Task task) - { + protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } From adf545483797392333135a3d7900b31b71110d9d Mon Sep 17 00:00:00 2001 From: yihy <2100087178@qq.com> Date: Thu, 30 Nov 2017 11:35:12 +0800 Subject: [PATCH 4/7] =?UTF-8?q?[Fix]=20=E4=BF=AE=E6=AD=A3=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E6=96=B9=E6=B3=95=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/scheduler/RedisScheduler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f..ee04f35c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,7 +61,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); @@ -71,7 +71,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c701fe8d38c8060e97df3efab64ae4a0d94c0245 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 30 Nov 2017 11:50:52 +0800 Subject: [PATCH 5/7] #702 Refactor: rename CheckForAdditionalInfo to checkForAdditionalInfo --- .../us/codecraft/webmagic/scheduler/RedisScheduler.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index 1e94971f..c70d8850 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -61,17 +61,17 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor Jedis jedis = pool.getResource(); try { jedis.rpush(getQueueKey(task), request.getUrl()); - if (CheckForAdditionalInfo(request)) { + if (checkForAdditionalInfo(request)) { String field = DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); } } finally { - pool.returnResource(jedis); + jedis.close(); } } - private boolean CheckForAdditionalInfo(Request request) { + private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } From c7d1ed7d201515fbf479dcb62c612711af56070a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:50:49 +0800 Subject: [PATCH 6/7] #fix httpcore version: change to 4.4.4 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 84ce1152..2b2384fd 100644 --- a/pom.xml +++ b/pom.xml @@ -78,7 +78,7 @@ org.apache.httpcomponents httpcore - 4.5.2 + 4.4.4 com.google.guava From be892b80bf6682cd063d30ac25a79be0c079a901 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 2 Dec 2017 10:57:06 +0800 Subject: [PATCH 7/7] update travis ci to openjdk --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a9f233f3..9e6f78d3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,3 @@ language: java jdk: - - oraclejdk7 + - openjdk7