重构一部分httpclient

pull/524/head
yihua.huang 8 years ago
parent 221c155060
commit a7f9e7cad5

@ -4,7 +4,7 @@ import org.apache.http.HttpHost;
import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.UsernamePasswordCredentials;
import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyPool; import us.codecraft.webmagic.proxy.ProxyPool;
import us.codecraft.webmagic.proxy.SimpleProxyPool; import us.codecraft.webmagic.proxy.TimerReuseProxyPool;
import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*; import java.util.*;
@ -487,12 +487,12 @@ public class Site {
* @return this * @return this
*/ */
public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) { public Site setHttpProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
this.httpProxyPool=new SimpleProxyPool(httpProxyList, isUseLastProxy); this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy);
return this; return this;
} }
public Site enableHttpProxyPool() { public Site enableHttpProxyPool() {
this.httpProxyPool=new SimpleProxyPool(); this.httpProxyPool=new TimerReuseProxyPool();
return this; return this;
} }

@ -3,16 +3,16 @@ package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.apache.http.HttpResponse; import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe; import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.config.CookieSpecs; import org.apache.http.auth.AuthState;
import org.apache.http.client.config.RequestConfig; import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder; import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair; import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils; import org.apache.http.util.EntityUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -23,12 +23,13 @@ import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.WMCollections; import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.*; import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/** /**
@ -46,9 +47,15 @@ public class HttpClientDownloader extends AbstractDownloader {
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
private CloseableHttpClient getHttpClient(Site site, Proxy proxy) { private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
this.httpUriRequestConverter = httpUriRequestConverter;
}
private CloseableHttpClient getHttpClient(Site site) {
if (site == null) { if (site == null) {
return httpClientGenerator.getClient(null, proxy); return httpClientGenerator.getClient(null);
} }
String domain = site.getDomain(); String domain = site.getDomain();
CloseableHttpClient httpClient = httpClients.get(domain); CloseableHttpClient httpClient = httpClients.get(domain);
@ -56,7 +63,7 @@ public class HttpClientDownloader extends AbstractDownloader {
synchronized (this) { synchronized (this) {
httpClient = httpClients.get(domain); httpClient = httpClients.get(domain);
if (httpClient == null) { if (httpClient == null) {
httpClient = httpClientGenerator.getClient(site, proxy); httpClient = httpClientGenerator.getClient(site);
httpClients.put(domain, httpClient); httpClients.put(domain, httpClient);
} }
} }
@ -66,35 +73,31 @@ public class HttpClientDownloader extends AbstractDownloader {
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
Site site = null; if (task == null || task.getSite() == null) {
if (task != null) { throw new NullPointerException("task or site can not be null");
site = task.getSite();
} }
Set<Integer> acceptStatCode; logger.debug("downloading page {}", request.getUrl());
String charset = null;
Map<String, String> headers = null;
if (site != null) {
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = WMCollections.newHashSet(200);
}
logger.info("downloading page {}", request.getUrl());
CloseableHttpResponse httpResponse = null; CloseableHttpResponse httpResponse = null;
int statusCode = 0; int statusCode = 0;
Site site = task.getSite();
try { try {
HttpHost proxyHost = null; Proxy proxy = null;
Proxy proxy = null; //TODO if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
proxy = site.getHttpProxyFromPool(); proxy = site.getHttpProxyFromPool();
proxyHost = proxy.getHttpHost();
} else if (site != null && site.getHttpProxy() != null){ } else if (site != null && site.getHttpProxy() != null){
proxyHost = site.getHttpProxy(); proxy = site.getHttpProxy();
request.putExtra(Request.PROXY, site.getHttpProxy());
} }
request.putExtra(Request.PROXY, proxy);
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); HttpContext httpContext = new BasicHttpContext();
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site);
AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials("userName", "password"));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
CloseableHttpClient httpClient = getHttpClient(site, proxy);
httpResponse = httpClient.execute(httpUriRequest, httpContext);
statusCode = httpResponse.getStatusLine().getStatusCode(); statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode); request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) { if (statusAccept(acceptStatCode, statusCode)) {
@ -134,72 +137,6 @@ public class HttpClientDownloader extends AbstractDownloader {
return acceptStatCode.contains(statusCode); return acceptStatCode.contains(statusCode);
} }
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) {
RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
if (headers != null) {
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
}
}
RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
if (site != null) {
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
}
if (proxy != null) {
requestConfigBuilder.setProxy(proxy);
request.putExtra(Request.PROXY, proxy);
}
requestBuilder.setConfig(requestConfigBuilder.build());
return requestBuilder.build();
}
protected RequestBuilder selectRequestMethod(Request request) {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get
return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
return addQueryParams(RequestBuilder.delete(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
return addQueryParams(RequestBuilder.trace(),request.getParams());
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
List<NameValuePair> allNameValuePair=new ArrayList<NameValuePair>();
if (nameValuePair != null && nameValuePair.length > 0) {
allNameValuePair= Arrays.asList(nameValuePair);
}
if (params != null) {
for (String key : params.keySet()) {
allNameValuePair.add(new BasicNameValuePair(key, params.get(key)));
}
}
requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
return requestBuilder;
}
private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
if (params != null) {
for (Map.Entry<String, String> entry : params.entrySet()) {
requestBuilder.addParameter(entry.getKey(), entry.getValue());
}
}
return requestBuilder;
}
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException { protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getContent(charset, httpResponse); String content = getContent(charset, httpResponse);
Page page = new Page(); Page page = new Page();

@ -1,13 +1,9 @@
package us.codecraft.webmagic.downloader; package us.codecraft.webmagic.downloader;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException; import org.apache.http.HttpException;
import org.apache.http.HttpRequest; import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor; import org.apache.http.HttpRequestInterceptor;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore; import org.apache.http.client.CookieStore;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.config.Registry; import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder; import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig; import org.apache.http.config.SocketConfig;
@ -21,7 +17,6 @@ import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import javax.net.ssl.SSLContext; import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager; import javax.net.ssl.TrustManager;
@ -92,31 +87,13 @@ public class HttpClientGenerator {
return this; return this;
} }
public CloseableHttpClient getClient(Site site, Proxy proxy) { public CloseableHttpClient getClient(Site site) {
return generateClient(site, proxy); return generateClient(site);
} }
private CloseableHttpClient generateClient(Site site, Proxy proxy) { private CloseableHttpClient generateClient(Site site) {
CredentialsProvider credsProvider = null;
HttpClientBuilder httpClientBuilder = HttpClients.custom(); HttpClientBuilder httpClientBuilder = HttpClients.custom();
if (proxy != null && StringUtils.isNotBlank(proxy.getUser()) && StringUtils.isNotBlank(proxy.getPassword()))
{
credsProvider= new BasicCredentialsProvider();
credsProvider.setCredentials(
new AuthScope(proxy.getHttpHost().getAddress().getHostAddress(), proxy.getHttpHost().getPort()),
new UsernamePasswordCredentials(proxy.getUser(), proxy.getPassword()));
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
}
if (site != null && site.getHttpProxy()!= null && site.getUsernamePasswordCredentials() != null){
credsProvider = new BasicCredentialsProvider();
credsProvider.setCredentials(
new AuthScope(site.getHttpProxy()),//可以访问的范围
site.getUsernamePasswordCredentials());//用户名和密码
httpClientBuilder.setDefaultCredentialsProvider(credsProvider);
}
httpClientBuilder.setConnectionManager(connectionManager); httpClientBuilder.setConnectionManager(connectionManager);
if (site != null && site.getUserAgent() != null) { if (site != null && site.getUserAgent() != null) {
httpClientBuilder.setUserAgent(site.getUserAgent()); httpClientBuilder.setUserAgent(site.getUserAgent());

@ -0,0 +1,98 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.message.BasicNameValuePair;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.utils.HttpConstant;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
 * Converts a WebMagic {@link Request} into an Apache HttpClient {@link HttpUriRequest}.
 * <p>
 * The HTTP method, query/form parameters, headers, timeouts, cookie spec and
 * (optional) proxy are all applied to the resulting request.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 11:28
 */
public class HttpUriRequestConverter {

    /**
     * Builds the HttpClient request for the given WebMagic request.
     *
     * @param request the request to convert; its URL, method, params and the
     *                {@code "nameValuePair"} extra are read
     * @param site    site settings supplying headers and timeouts; may be {@code null},
     *                in which case no headers or timeouts are applied
     * @param proxy   proxy to route the request through; may be {@code null} (or carry
     *                a {@code null} host) for a direct connection
     * @return the built request
     * @throws IllegalArgumentException if the request's HTTP method is not supported
     */
    public HttpUriRequest convert(Request request, Site site, Proxy proxy) {
        Map<String, String> headers = (site != null) ? site.getHeaders() : null;
        HttpHost proxyHost = null;
        if (proxy != null && proxy.getProxyHost() != null) {
            proxyHost = new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort());
        }
        return getHttpUriRequest(request, site, headers, proxyHost);
    }

    /**
     * Assembles the request: method + parameters first, then headers, then the
     * per-request configuration (timeouts, cookie spec, proxy).
     */
    private HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers, HttpHost proxy) {
        RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
        if (headers != null) {
            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
            }
        }
        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
        if (site != null) {
            // The single site timeout is used for pool lease, socket read and connect alike.
            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                    .setSocketTimeout(site.getTimeOut())
                    .setConnectTimeout(site.getTimeOut())
                    .setCookieSpec(CookieSpecs.BEST_MATCH);
        }
        if (proxy != null) {
            requestConfigBuilder.setProxy(proxy);
        }
        requestBuilder.setConfig(requestConfigBuilder.build());
        return requestBuilder.build();
    }

    /**
     * Picks the {@link RequestBuilder} matching the request's HTTP method (GET when
     * the method is {@code null}) and attaches the request parameters: as query
     * parameters for GET/HEAD/DELETE/TRACE, as a form entity for POST/PUT.
     *
     * @throws IllegalArgumentException if the method is none of the above
     */
    private RequestBuilder selectRequestMethod(Request request) {
        String method = request.getMethod();
        if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
            //default get
            return addQueryParams(RequestBuilder.get(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
            return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
            return addQueryParams(RequestBuilder.head(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
            return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
            return addQueryParams(RequestBuilder.delete(), request.getParams());
        } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
            return addQueryParams(RequestBuilder.trace(), request.getParams());
        }
        throw new IllegalArgumentException("Illegal HTTP Method " + method);
    }

    /**
     * Sets a URL-encoded form entity made of the explicit name/value pairs followed
     * by the entries of {@code params}. Either argument may be {@code null}.
     */
    private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
        List<NameValuePair> allNameValuePair = new ArrayList<NameValuePair>();
        if (nameValuePair != null && nameValuePair.length > 0) {
            // Copy instead of aliasing: Arrays.asList returns a fixed-size view, so
            // appending the params below would throw UnsupportedOperationException.
            allNameValuePair.addAll(Arrays.asList(nameValuePair));
        }
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                allNameValuePair.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
            }
        }
        requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
        return requestBuilder;
    }

    /**
     * Adds every entry of {@code params} to the builder as a request parameter;
     * a {@code null} map adds nothing.
     */
    private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
        if (params != null) {
            for (Map.Entry<String, String> entry : params.entrySet()) {
                requestBuilder.addParameter(entry.getKey(), entry.getValue());
            }
        }
        return requestBuilder;
    }
}

@ -1,199 +1,47 @@
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
/**
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--+---+
|
v
+--------+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Respone Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
+--------+
*/
/** /**
* Object has these status of lifecycle above.<br>
* *
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see SimpleProxyPool
*/ */
public class Proxy implements Delayed, Serializable { public class Proxy {
private static final long serialVersionUID = 228939737383625551L;
public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404;
public static final int ERROR_BANNED = 10000;// banned by website
public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200;
private final HttpHost httpHost; private ProxyHost proxyHost;
private String user; private String user;
private String password; private String password;
private int reuseTimeInterval = 1500;// ms
private Long canReuseTime = 0L;
private Long lastBorrowTime = System.currentTimeMillis();
private Long responseTime = 0L;
private int failedNum = 0;
private int successNum = 0;
private int borrowNum = 0;
private List<Integer> failedErrorType = new ArrayList<Integer>();
public Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}
public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) { public Proxy(ProxyHost proxyHost, String user, String password) {
this.httpHost = httpHost; this.proxyHost = proxyHost;
this.user = user; this.user = user;
this.password = password; this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseInterval, TimeUnit.MILLISECONDS);
} }
public int getSuccessNum() { public Proxy(ProxyHost proxyHost) {
return successNum; this.proxyHost = proxyHost;
} }
public void successNumIncrement(int increment) { public ProxyHost getProxyHost() {
this.successNum += increment; return proxyHost;
} }
public Long getLastUseTime() { public void setProxyHost(ProxyHost proxyHost) {
return lastBorrowTime; this.proxyHost = proxyHost;
} }
public void setLastBorrowTime(Long lastBorrowTime) { public String getUser() {
this.lastBorrowTime = lastBorrowTime; return user;
}
public void recordResponse() {
this.responseTime = (System.currentTimeMillis() - lastBorrowTime + responseTime) / 2;
this.lastBorrowTime = System.currentTimeMillis();
}
public List<Integer> getFailedErrorType() {
return failedErrorType;
}
public void setFailedErrorType(List<Integer> failedErrorType) {
this.failedErrorType = failedErrorType;
}
public void fail(int failedErrorType) {
this.failedNum++;
this.failedErrorType.add(failedErrorType);
}
public void setFailedNum(int failedNum) {
this.failedNum = failedNum;
}
public int getFailedNum() {
return failedNum;
}
public String getFailedType() {
String re = "";
for (Integer i : this.failedErrorType) {
re += i + " . ";
}
return re;
}
public HttpHost getHttpHost() {
return httpHost;
}
public int getReuseTimeInterval() {
return reuseTimeInterval;
}
public void setReuseTimeInterval(int reuseTimeInterval) {
this.reuseTimeInterval = reuseTimeInterval;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}
@Override
public long getDelay(TimeUnit unit) {
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
} }
@Override public void setUser(String user) {
public int compareTo(Delayed o) { this.user = user;
Proxy that = (Proxy) o;
return canReuseTime > that.canReuseTime ? 1 : (canReuseTime < that.canReuseTime ? -1 : 0);
} }
@Override public String getPassword() {
public String toString() {
String re = String.format("host: %15s >> %5dms >> success: %-3.2f%% >> borrow: %d", httpHost.getAddress().getHostAddress(), responseTime,
successNum * 100.0 / borrowNum, borrowNum);
return re;
}
public String getUser()
{
return user;
}
public String getPassword()
{
return password; return password;
} }
public void borrowNumIncrement(int increment) { public void setPassword(String password) {
this.borrowNum += increment; this.password = password;
} }
public int getBorrowNum() {
return borrowNum;
}
} }

@ -0,0 +1,34 @@
package us.codecraft.webmagic.proxy;
/**
 * Host name (or address) and port of a proxy server, kept independent of any
 * particular HTTP client library.
 *
 * @author code4crafter@gmail.com
 *         Date: 17/3/18
 *         Time: 12:04
 */
public class ProxyHost {

    private String host;

    private int port;

    /**
     * @param host proxy host name or IP address
     * @param port proxy port
     */
    public ProxyHost(String host, int port) {
        this.host = host;
        this.port = port;
    }

    public String getHost() {
        return host;
    }

    public void setHost(String host) {
        this.host = host;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    /**
     * Returns {@code host:port} so that log statements which concatenate a
     * {@code ProxyHost} show the endpoint rather than an identity hash.
     */
    @Override
    public String toString() {
        return host + ":" + port;
    }
}

@ -6,7 +6,10 @@ import org.apache.http.HttpHost;
* Created by edwardsbean on 15-2-28. * Created by edwardsbean on 15-2-28.
*/ */
public interface ProxyPool { public interface ProxyPool {
public void returnProxy(HttpHost host, int statusCode);
public Proxy getProxy(); void returnProxy(HttpHost host, int statusCode);
public boolean isEnable();
Proxy getProxy();
boolean isEnable();
} }

@ -0,0 +1,163 @@
package us.codecraft.webmagic.proxy;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
/**
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
| +------+ |
+->| init |<--+
+--+---+
|
v
+--------+
+--->| borrow |
| +---+----+
| |+------------------+
| v
| +--------+
| | in use | Response Time
| +---+----+
| |+------------------+
| v
| +--------+
| | return |
| +---+----+
| |+-------------------+
| v
| +-------+ reuse interval
| | delay | (delay time)
| +---+---+
| |+-------------------+
| v
| +------+
| | idle | idle time
| +---+--+
| |+-------------------+
+--------+
*/
/**
 * A pooled proxy that follows the lifecycle drawn above: after each use it becomes
 * available again only once its reuse interval has elapsed, and it keeps simple
 * borrow / success / failure statistics.<br>
 * <p>
 * Instances order themselves by the time at which they may be reused, as required
 * by {@link Delayed}.
 * <p>
 * NOTE(review): this class is {@link Serializable} but its superclass {@code Proxy}
 * does not appear to be, nor to declare a no-arg constructor; Java deserialization
 * needs one of the two - confirm before relying on persisted instances.
 *
 * @author yxssfxwzy@sina.com <br>
 * @since 0.5.1
 * @see TimerReuseProxyPool
 */
public class TimerReuseProxy extends Proxy implements Delayed, Serializable {

    private static final long serialVersionUID = 228939737383625551L;

    public static final int ERROR_403 = 403;
    public static final int ERROR_404 = 404;
    public static final int ERROR_BANNED = 10000;// banned by website
    public static final int ERROR_Proxy = 10001;// the proxy itself failed
    public static final int SUCCESS = 200;

    /** Reuse interval applied when the caller does not supply one, in milliseconds. */
    private static final int DEFAULT_REUSE_INTERVAL_MS = 1500;

    private int reuseTimeInterval = DEFAULT_REUSE_INTERVAL_MS;// ms
    /** Earliest reuse instant, expressed on the {@link System#nanoTime()} clock. */
    private Long canReuseTime = 0L;
    private Long lastBorrowTime = System.currentTimeMillis();
    private Long responseTime = 0L;

    private int failedNum = 0;
    private int successNum = 0;
    private int borrowNum = 0;

    private List<Integer> failedErrorType = new ArrayList<Integer>();

    /**
     * Creates a proxy with the default reuse interval.
     */
    public TimerReuseProxy(ProxyHost proxyHost, String user, String password) {
        this(proxyHost, user, password, DEFAULT_REUSE_INTERVAL_MS);
    }

    /**
     * Creates a proxy that becomes reusable {@code reuseTimeInterval} milliseconds
     * from now.
     *
     * @param reuseTimeInterval minimum pause between two uses, in milliseconds
     */
    public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) {
        super(proxyHost, user, password);
        this.reuseTimeInterval = reuseTimeInterval;
        // canReuseTime must be anchored to nanoTime(): its origin is arbitrary, so
        // leaving the field at 0 makes getDelay() meaningless.
        this.canReuseTime = reuseDeadline(reuseTimeInterval);
    }

    /** Computes the nanoTime-based instant that lies {@code intervalMs} ms in the future. */
    private static long reuseDeadline(int intervalMs) {
        return System.nanoTime() + TimeUnit.NANOSECONDS.convert(intervalMs, TimeUnit.MILLISECONDS);
    }

    public int getSuccessNum() {
        return successNum;
    }

    public void successNumIncrement(int increment) {
        this.successNum += increment;
    }

    public Long getLastUseTime() {
        return lastBorrowTime;
    }

    public void setLastBorrowTime(Long lastBorrowTime) {
        this.lastBorrowTime = lastBorrowTime;
    }

    /**
     * Folds the time elapsed since the last borrow into {@code responseTime}
     * (mean of the previous value and the new sample) and restarts the clock.
     */
    public void recordResponse() {
        long now = System.currentTimeMillis();
        this.responseTime = (now - lastBorrowTime + responseTime) / 2;
        this.lastBorrowTime = now;
    }

    public List<Integer> getFailedErrorType() {
        return failedErrorType;
    }

    public void setFailedErrorType(List<Integer> failedErrorType) {
        this.failedErrorType = failedErrorType;
    }

    /**
     * Records one failure of the given type.
     *
     * @param failedErrorType error code to remember, e.g. one of the {@code ERROR_*} constants
     */
    public void fail(int failedErrorType) {
        this.failedNum++;
        this.failedErrorType.add(failedErrorType);
    }

    public void setFailedNum(int failedNum) {
        this.failedNum = failedNum;
    }

    public int getFailedNum() {
        return failedNum;
    }

    /**
     * @return the recorded failure codes, each followed by {@code " . "}; empty when
     *         nothing has failed
     */
    public String getFailedType() {
        StringBuilder joined = new StringBuilder();
        for (Integer errorType : this.failedErrorType) {
            joined.append(errorType).append(" . ");
        }
        return joined.toString();
    }

    public int getReuseTimeInterval() {
        return reuseTimeInterval;
    }

    /**
     * Changes the reuse interval and pushes the next reuse instant to
     * {@code reuseTimeInterval} milliseconds from now.
     */
    public void setReuseTimeInterval(int reuseTimeInterval) {
        this.reuseTimeInterval = reuseTimeInterval;
        this.canReuseTime = reuseDeadline(reuseTimeInterval);
    }

    /**
     * @return remaining time until this proxy may be reused, in {@code unit};
     *         zero or negative once the interval has passed
     */
    @Override
    public long getDelay(TimeUnit unit) {
        return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
    }

    /**
     * Orders proxies by their reuse instant, earliest first. Only comparison with
     * other {@code TimerReuseProxy} instances is supported.
     */
    @Override
    public int compareTo(Delayed o) {
        TimerReuseProxy that = (TimerReuseProxy) o;
        return canReuseTime.compareTo(that.canReuseTime);
    }

    public void borrowNumIncrement(int increment) {
        this.borrowNum += increment;
    }

    public int getBorrowNum() {
        return borrowNum;
    }
}

@ -22,12 +22,12 @@ import java.util.concurrent.DelayQueue;
* @see Proxy * @see Proxy
* @since 0.5.1 * @since 0.5.1
*/ */
public class SimpleProxyPool implements ProxyPool { public class TimerReuseProxyPool implements ProxyPool {
private Logger logger = LoggerFactory.getLogger(getClass()); private Logger logger = LoggerFactory.getLogger(getClass());
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>(); private BlockingQueue<TimerReuseProxy> proxyQueue = new DelayQueue<TimerReuseProxy>();
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>(); private Map<String, TimerReuseProxy> allProxy = new ConcurrentHashMap<String, TimerReuseProxy>();
private int reuseInterval = 1500;// ms private int reuseInterval = 1500;// ms
private int reviveTime = 2 * 60 * 60 * 1000;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms
@ -50,15 +50,15 @@ public class SimpleProxyPool implements ProxyPool {
} }
}; };
public SimpleProxyPool() { public TimerReuseProxyPool() {
this(null, true); this(null, true);
} }
public SimpleProxyPool(List<String[]> httpProxyList) { public TimerReuseProxyPool(List<String[]> httpProxyList) {
this(httpProxyList, true); this(httpProxyList, true);
} }
public SimpleProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) { public TimerReuseProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
if (httpProxyList != null) { if (httpProxyList != null) {
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
} }
@ -109,9 +109,9 @@ public class SimpleProxyPool implements ProxyPool {
} }
private Map<String, Proxy> prepareForSaving() { private Map<String, Proxy> prepareForSaving() {
Map<String, Proxy> tmp = new HashMap<String, Proxy>(); Map<String, TimerReuseProxy> tmp = new HashMap<String, TimerReuseProxy>();
for (Entry<String, Proxy> e : allProxy.entrySet()) { for (Entry<String, TimerReuseProxy> e : allProxy.entrySet()) {
Proxy p = e.getValue(); TimerReuseProxy p = e.getValue();
p.setFailedNum(0); p.setFailedNum(0);
tmp.put(e.getKey(), p); tmp.put(e.getKey(), p);
} }
@ -152,30 +152,20 @@ public class SimpleProxyPool implements ProxyPool {
logger.info("proxy pool size>>>>" + allProxy.size()); logger.info("proxy pool size>>>>" + allProxy.size());
} }
public void addProxy(String[]... httpProxyList) { public void addProxy(Proxy... httpProxyList) {
isEnable = true; isEnable = true;
for (String[] s : httpProxyList) { for (Proxy proxy : httpProxyList) {
try { if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) {
if (allProxy.containsKey(s[2])) { TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUser(), proxy.getPassword(), reuseInterval);
continue; proxyQueue.add(p);
} allProxy.put(p.getProxyHost().getHost(), p);
HttpHost item = new HttpHost(InetAddress.getByName(s[2]), Integer.valueOf(s[3]));
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval, s[0], s[1]);
proxyQueue.add(p);
allProxy.put(s[2], p);
}
} catch (NumberFormatException e) {
logger.error("HttpHost init error:", e);
} catch (UnknownHostException e) {
logger.error("HttpHost init error:", e);
} }
} }
logger.info("proxy pool size>>>>" + allProxy.size()); logger.info("proxy pool size>>>>" + allProxy.size());
} }
public Proxy getProxy() { public TimerReuseProxy getProxy() {
Proxy proxy = null; TimerReuseProxy proxy = null;
try { try {
Long time = System.currentTimeMillis(); Long time = System.currentTimeMillis();
proxy = proxyQueue.take(); proxy = proxyQueue.take();
@ -183,7 +173,7 @@ public class SimpleProxyPool implements ProxyPool {
if (costTime > reuseInterval) { if (costTime > reuseInterval) {
logger.info("get proxy time >>>> " + costTime); logger.info("get proxy time >>>> " + costTime);
} }
Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost());
p.setLastBorrowTime(System.currentTimeMillis()); p.setLastBorrowTime(System.currentTimeMillis());
p.borrowNumIncrement(1); p.borrowNumIncrement(1);
} catch (InterruptedException e) { } catch (InterruptedException e) {

@ -1,19 +1,14 @@
package us.codecraft.webmagic.utils; package us.codecraft.webmagic.utils;
import java.io.IOException;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.Socket;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Enumeration;
import java.util.regex.Pattern;
import org.apache.http.HttpHost; import org.apache.http.HttpHost;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.proxy.ProxyHost;
import java.io.IOException;
import java.net.*;
import java.util.Enumeration;
import java.util.regex.Pattern;
/** /**
* Pooled Proxy Object * Pooled Proxy Object
@ -69,7 +64,11 @@ public class ProxyUtils {
} }
} }
public static boolean validateProxy(HttpHost p) { public static HttpHost convert(ProxyHost p){
return new HttpHost(p.getHost(),p.getPort());
}
public static boolean validateProxy(ProxyHost p) {
if (localAddr == null) { if (localAddr == null) {
logger.error("cannot get local IP"); logger.error("cannot get local IP");
return false; return false;
@ -79,7 +78,7 @@ public class ProxyUtils {
try { try {
socket = new Socket(); socket = new Socket();
socket.bind(new InetSocketAddress(localAddr, 0)); socket.bind(new InetSocketAddress(localAddr, 0));
InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getAddress().getHostAddress(), p.getPort()); InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort());
socket.connect(endpointSocketAddr, 3000); socket.connect(endpointSocketAddr, 3000);
logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p); logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p);
isReachable = true; isReachable = true;

@ -29,7 +29,7 @@ public class ProxyTest {
@Test @Test
public void testProxy() { public void testProxy() {
SimpleProxyPool proxyPool = new SimpleProxyPool(httpProxyList,false); TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false);
proxyPool.setReuseInterval(500); proxyPool.setReuseInterval(500);
assertThat(proxyPool.getIdleNum()).isEqualTo(4); assertThat(proxyPool.getIdleNum()).isEqualTo(4);
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++) {

Loading…
Cancel
Save