Merge pull request #1 from code4craft/master

update
pull/893/head
qingmo 5 years ago committed by GitHub
commit 77997ca14b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,3 +1,3 @@
language: java
jdk:
- oraclejdk7
- openjdk7

@ -75,6 +75,11 @@
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.4</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>

@ -2,7 +2,6 @@ package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
@ -30,7 +29,6 @@ import java.util.Map;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
@ThreadSafe
public class HttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());

@ -9,6 +9,7 @@ import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
@ -49,7 +50,9 @@ public class HttpClientGenerator {
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
try {
return new SSLConnectionSocketFactory(createIgnoreVerifySSL()); // 优先绕过安全证书
return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
null,
new DefaultHostnameVerifier()); // 优先绕过安全证书
} catch (KeyManagementException e) {
logger.error("ssl connection fail", e);
} catch (NoSuchAlgorithmException e) {

@ -1,10 +1,8 @@
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
@ -21,7 +19,6 @@ import java.util.Map;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
@ThreadSafe
public class FilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = LoggerFactory.getLogger(getClass());

@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.NumberUtils;
@ -16,7 +15,6 @@ import java.util.concurrent.PriorityBlockingQueue;
* @author code4crafter@gmail.com <br>
* @since 0.2.1
*/
@ThreadSafe
public class PriorityScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
public static final int INITIAL_CAPACITY = 5;

@ -1,6 +1,5 @@
package us.codecraft.webmagic.scheduler;
import org.apache.http.annotation.ThreadSafe;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
@ -15,7 +14,6 @@ import java.util.concurrent.LinkedBlockingQueue;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
@ThreadSafe
public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
private BlockingQueue<Request> queue = new LinkedBlockingQueue<Request>();

@ -0,0 +1,26 @@
package us.codecraft.webmagic.downloader;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Smoke test that downloads an HTTPS page through {@link HttpClientDownloader}
 * and expects the download to be reported as successful.
 * <p>
 * NOTE(review): the method name suggests the target host is used to exercise
 * TLS 1.2 negotiation — confirm against the host's current configuration.
 * The test needs outbound network access to run.
 *
 * @author code4crafter@gmail.com
 * Date: 2017/11/29
 * Time: 1:32
 */
public class SSLCompatibilityTest {

    /**
     * Fetches the page with a task configured for five cycle retries and
     * asserts that the resulting page is flagged as successfully downloaded.
     */
    @Test
    public void test_tls12() throws Exception {
        HttpClientDownloader downloader = new HttpClientDownloader();
        Site retryingSite = Site.me().setCycleRetryTimes(5);
        Task crawlTask = retryingSite.toTask();
        Request httpsRequest = new Request("https://juejin.im/");

        Page fetched = downloader.download(httpsRequest, crawlTask);
        boolean succeeded = fetched.isDownloadSuccess();

        assertThat(succeeded).isTrue();
    }
}

@ -1,6 +1,5 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
@ -16,7 +15,6 @@ import java.io.*;
* @author dolphineor@gmail.com
* @version 0.5.3
*/
@ThreadSafe
public class PhantomJSDownloader extends AbstractDownloader {
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

@ -2,6 +2,7 @@ package us.codecraft.webmagic.scheduler;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
@ -60,14 +61,41 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
Jedis jedis = pool.getResource();
try {
jedis.rpush(getQueueKey(task), request.getUrl());
if (request.getExtras() != null) {
if (checkForAdditionalInfo(request)) {
String field = DigestUtils.shaHex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
} finally {
pool.returnResource(jedis);
jedis.close();
}
}
/**
 * Tells whether a request carries any information beyond its URL.
 * <p>
 * The checks run in a fixed order and stop at the first hit: headers,
 * cookies, charset, method, binary-content flag, request body, extras,
 * and finally a non-zero priority.
 *
 * @param request the request to inspect; may be {@code null}
 * @return {@code false} for a {@code null} request or one where none of the
 *         inspected attributes is set, {@code true} otherwise
 */
private boolean checkForAdditionalInfo(Request request) {
    if (request == null) {
        return false;
    }

    // A single short-circuit expression keeps the original evaluation order.
    return !request.getHeaders().isEmpty()
            || !request.getCookies().isEmpty()
            || StringUtils.isNotBlank(request.getCharset())
            || StringUtils.isNotBlank(request.getMethod())
            || request.isBinaryContent()
            || request.getRequestBody() != null
            || (request.getExtras() != null && !request.getExtras().isEmpty())
            || request.getPriority() != 0L;
}
@Override
@ -85,7 +113,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
Request o = JSON.parseObject(new String(bytes), Request.class);
return o;
}
Request request = new Request(url);
Request request = new Request(url);
return request;
} finally {
pool.returnResource(jedis);
@ -100,8 +128,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
return QUEUE_PREFIX + task.getUUID();
}
protected String getItemKey(Task task)
{
protected String getItemKey(Task task) {
return ITEM_PREFIX + task.getUUID();
}

Loading…
Cancel
Save