diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index edbf934d..4879b282 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -40,12 +40,8 @@ public class Site { private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); - private static final Set DEFAULT_REFRESH_CODE_SET = new HashSet<>(); - - private Set refreshCode = DEFAULT_REFRESH_CODE_SET; private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; - private Map headers = new HashMap(); private boolean useGzip = true; @@ -53,7 +49,6 @@ public class Site { private boolean disableCookieManagement = false; static { - DEFAULT_REFRESH_CODE_SET.add(HttpConstant.StatusCode.FORBIDDEN); DEFAULT_STATUS_CODE_SET.add(HttpConstant.StatusCode.CODE_200); } @@ -202,15 +197,6 @@ public class Site { return this; } - public Site setRefreshCode(Set refreshCode){ - this.refreshCode = refreshCode; - return this; - } - public Set getRefreshCode(){ - return refreshCode; - - } - /** * get acceptStatCode * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 474b7433..a5ac8aa2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -424,10 +424,7 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } - } else if(site.getRefreshCode().contains(page.getStatusCode())) { - logger.info("page status code error, page {} , code: {}, start refresh downloader", request.getUrl(), page.getStatusCode()); - downloader.refreshComponent(this); - }else { + } else { logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode()); } sleep(site.getSleepTime()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index 50955012..f7ced493 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -18,18 +18,14 @@ public interface Downloader { * Downloads web pages and store in Page object. * * @param request request - * @param task task + * @param task task * @return page */ - Page download(Request request, Task task); + public Page download(Request request, Task task); /** * Tell the downloader how many threads the spider used. - * * @param threadNum number of threads */ - void setThread(int threadNum); - - - void refreshComponent(Task task); + public void setThread(int threadNum); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index eed49fb4..f9f8c829 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -111,17 +111,6 @@ public class HttpClientDownloader extends AbstractDownloader { } } - - @Override - public void refreshComponent(Task task) { - if (proxyProvider != null ) { - proxyProvider.refreshProxy(task); - } - - httpClients.remove(task.getSite().getDomain()); - - } - @Override public void setThread(int thread) { httpClientGenerator.setPoolSize(thread); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 1f20c5a5..80e0f108 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,17 +1,13 @@ package us.codecraft.webmagic.downloader; -import java.io.File; import java.io.IOException; import java.security.KeyManagementException; -import java.security.KeyStore; -import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.util.Map; import javax.net.ssl.SSLContext; -import javax.net.ssl.SSLContextSpi; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; @@ -28,7 +24,6 @@ import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; @@ -37,7 +32,6 @@ import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; -import org.apache.http.ssl.SSLContexts; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -75,7 +69,7 @@ public class HttpClientGenerator { return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException | CertificateException | KeyStoreException | IOException e) { + } catch (KeyManagementException e) { logger.error("ssl connection fail", e); } catch (NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); @@ -83,8 +77,8 @@ public class HttpClientGenerator { return SSLConnectionSocketFactory.getSocketFactory(); } - private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException, CertificateException, KeyStoreException, IOException { -// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 + private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { + // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override @@ -102,10 +96,10 @@ public class HttpClientGenerator { }; - SSLContext sc = SSLContext.getInstance("SSLv3"); + SSLContext sc = SSLContext.getInstance("TLS"); sc.init(null, new TrustManager[] { trustManager }, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java index bfacec35..2d6b8fe2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java @@ -28,7 +28,6 @@ public abstract class HttpConstant { public static abstract class StatusCode { public static final int CODE_200 = 200; - public static final int FORBIDDEN = 403; } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index 6b9c4232..4f4a2806 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -57,11 +57,6 @@ public class SpiderTest { return Site.me().setSleepTime(0); } }).setDownloader(new Downloader() { - @Override - public void refreshComponent(Task task) { - - } - @Override public Page download(Request request, Task task) { return new Page().setRawText(""); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java index 6d764a59..3aa742c1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java @@ -28,11 +28,6 @@ public class MockGithubDownloader implements Downloader { return page; } - @Override - public void refreshComponent(Task task) { - - } - @Override public void setThread(int threadNum) { } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index f3751d65..6055bdb0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -42,12 +42,7 @@ public class PhantomJSDownloader extends AbstractDownloader { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - - @Override - public void refreshComponent(Task task) { - - } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
index 77446929..91e3698c 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java
@@ -9,10 +9,6 @@ import us.codecraft.webmagic.selector.PlainText;
  * @author code4crafter@gmail.com
  */
 public class MockGithubDownloader implements Downloader{
-    @Override
-    public void refreshComponent(Task task) {
-
-    }
 
     private String html = "\n" +
             "\n" +
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index 11b23562..cce293fc 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -59,11 +59,6 @@ public class SeleniumDownloader implements Downloader, Closeable {
 		// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
 	}
 
-	@Override
-	public void refreshComponent(Task task) {
-
-	}
-
 	/**
 	 * set sleep time to wait until load success
 	 *