diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index d342069f..87eab14c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -2,8 +2,7 @@ package us.codecraft.webmagic; import org.apache.http.HttpHost; import org.apache.http.auth.UsernamePasswordCredentials; -import us.codecraft.webmagic.proxy.ProxyPool; -import us.codecraft.webmagic.proxy.TimerReuseProxyPool; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.utils.UrlUtils; import java.util.*; @@ -52,7 +51,7 @@ public class Site { private UsernamePasswordCredentials usernamePasswordCredentials; //代理用户名密码设置 - private ProxyPool httpProxyPool; + private ProxyProvider httpProxyPool; private boolean useGzip = true; @@ -399,7 +398,11 @@ public class Site { return new Task() { @Override public String getUUID() { - return Site.this.getDomain(); + String uuid = Site.this.getDomain(); + if (uuid == null) { + uuid = UUID.randomUUID().toString(); + } + return uuid; } @Override @@ -467,45 +470,4 @@ public class Site { '}'; } - /** - * Set httpProxyPool, String[0]:ip, String[1]:port
- * - * @param proxyPool proxyPool - * @return this - */ - public Site setHttpProxyPool(ProxyPool proxyPool) { - this.httpProxyPool = proxyPool; - return this; - } - - /** - * Set httpProxyPool, String[0]:ip, String[1]:port
- * - * @param httpProxyList httpProxyList - * @param isUseLastProxy isUseLastProxy - * @return this - */ - public Site setHttpProxyPool(List httpProxyList, boolean isUseLastProxy) { - this.httpProxyPool=new TimerReuseProxyPool(httpProxyList, isUseLastProxy); - return this; - } - - public Site enableHttpProxyPool() { - this.httpProxyPool=new TimerReuseProxyPool(); - return this; - } - - public UsernamePasswordCredentials getUsernamePasswordCredentials() { - return usernamePasswordCredentials; - } - - public Site setUsernamePasswordCredentials(UsernamePasswordCredentials usernamePasswordCredentials) { - this.usernamePasswordCredentials = usernamePasswordCredentials; - return this; - } - - public ProxyPool getHttpProxyPool() { - return httpProxyPool; - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 93a8a7ce..3a44af65 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -20,6 +20,7 @@ import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; +import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; @@ -45,11 +46,17 @@ public class HttpClientDownloader extends AbstractDownloader { private HttpClientGenerator httpClientGenerator = new HttpClientGenerator(); private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + + private ProxyProvider proxyProvider; public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } + public void setProxyProvider(ProxyProvider proxyProvider) { + this.proxyProvider = proxyProvider; + } + private CloseableHttpClient getHttpClient(Site site) { if (site == null) { return httpClientGenerator.getClient(null); @@ -79,8 +86,8 @@ public class HttpClientDownloader extends AbstractDownloader { Site site = task.getSite(); Proxy proxy = null; HttpContext httpContext = new BasicHttpContext(); - if (site.getHttpProxyPool() != null) { - proxy = site.getHttpProxyPool().getProxy(task); + if (proxyProvider != null) { + proxy = proxyProvider.getProxy(task); request.putExtra(Request.PROXY, proxy); AuthState authState = new AuthState(); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); @@ -111,9 +118,6 @@ public class HttpClientDownloader extends AbstractDownloader { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } - if (proxy != null) { - site.getHttpProxyPool().returnProxy(proxy, statusCode, task); - } } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 0ec4b0e9..951d3323 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -43,7 +43,7 @@ public class HttpUriRequestConverter { } if (proxy != null) { - requestConfigBuilder.setProxy(new HttpHost(proxy.getProxyHost().getHost(), proxy.getProxyHost().getPort())); + requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort())); } requestBuilder.setConfig(requestConfigBuilder.build()); return requestBuilder.build(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 1d872d43..a38ccaa7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -6,42 +6,36 @@ package us.codecraft.webmagic.proxy; public class Proxy { - private ProxyHost proxyHost; + private String host; + private int port; private String username; private String password; - public Proxy(ProxyHost proxyHost, String username, String password) { - this.proxyHost = proxyHost; - this.username = username; - this.password = password; + public Proxy(String host, int port) { + this.host = host; + this.port = port; } - public Proxy(ProxyHost proxyHost) { - this.proxyHost = proxyHost; + public Proxy(String host, int port, String username, String password) { + this.host = host; + this.port = port; + this.username = username; + this.password = password; } - public ProxyHost getProxyHost() { - return proxyHost; + public String getHost() { + return host; } - public void setProxyHost(ProxyHost proxyHost) { - this.proxyHost = proxyHost; + public int getPort() { + return port; } public String getUsername() { return username; } - public void setUsername(String username) { - this.username = username; - } - public String getPassword() { return password; } - - public void setPassword(String password) { - this.password = password; - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java deleted file mode 100644 index 11e8c87b..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyHost.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.proxy; - -/** - * @author code4crafter@gmail.com - * Date: 17/3/18 - * Time: 下午12:04 - */ -public class ProxyHost { - - private String host; - - private int port; - - public String getHost() { - return host; - } - - public ProxyHost(String host, int port) { - this.host = host; - this.port = port; - } - - public void setHost(String host) { - this.host = host; - } - - public int getPort() { - return port; - } - - public void setPort(int port) { - this.port = port; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java similarity index 87% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index fcc1f8df..4266d78c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -5,7 +5,7 @@ import us.codecraft.webmagic.Task; /** * Created by edwardsbean on 15-2-28. */ -public interface ProxyPool { +public interface ProxyProvider { void returnProxy(Proxy proxy, boolean banned, Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java index 8f592527..7002df47 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxy.java @@ -72,14 +72,10 @@ public class TimerReuseProxy extends Proxy implements Delayed, Serializable { private List failedErrorType = new ArrayList(); - public TimerReuseProxy(ProxyHost proxyHost, String user, String password) { - super(proxyHost, user, password); + public TimerReuseProxy(String host, int port, String username, String password) { + super(host, port, username, password); } - public TimerReuseProxy(ProxyHost proxyHost, String user, String password, int reuseTimeInterval) { - super(proxyHost, user, password); - this.reuseTimeInterval = reuseTimeInterval; - } public int getSuccessNum() { return successNum; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index 6fde6047..6dbac5d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -1,17 +1,6 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.utils.FilePersistentBase; -import us.codecraft.webmagic.utils.ProxyUtils; - -import java.io.*; -import java.util.*; -import java.util.Map.Entry; -import java.util.concurrent.BlockingQueue; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.DelayQueue; +import us.codecraft.webmagic.Task; /** * Pooled Proxy Object @@ -20,187 +9,196 @@ import java.util.concurrent.DelayQueue; * @see Proxy * @since 0.5.1 */ -public class TimerReuseProxyPool implements ProxyPool { - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private BlockingQueue proxyQueue = new DelayQueue(); - private Map allProxy = new ConcurrentHashMap(); - - private int reuseInterval = 1500;// ms - private int reviveTime = 2 * 60 * 60 * 1000;// ms - private int saveProxyInterval = 10 * 60 * 1000;// ms - - private boolean isEnable = false; - private boolean validateWhenInit = false; - // private boolean isUseLastProxy = true; - - public TimerReuseProxyPool(List httpProxyList) { - this(httpProxyList, true); - } - - private void addProxy(Map httpProxyMap) { - isEnable = true; - for (Entry entry : httpProxyMap.entrySet()) { - try { - if (allProxy.containsKey(entry.getKey())) { - continue; - } - if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { - entry.getValue().setFailedNum(0); - entry.getValue().setReuseTimeInterval(reuseInterval); - proxyQueue.add(entry.getValue()); - allProxy.put(entry.getKey(), entry.getValue()); - } - } catch (NumberFormatException e) { - logger.error("HttpHost init error:", e); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); - } - - public void addProxy(Proxy... httpProxyList) { - isEnable = true; - for (Proxy proxy : httpProxyList) { - if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { - TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); - proxyQueue.add(p); - allProxy.put(p.getProxyHost().getHost(), p); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); - } - - public TimerReuseProxy getProxy() { - TimerReuseProxy proxy = null; - try { - Long time = System.currentTimeMillis(); - proxy = proxyQueue.take(); - double costTime = (System.currentTimeMillis() - time) / 1000.0; - if (costTime > reuseInterval) { - logger.info("get proxy time >>>> " + costTime); - } - TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); - p.setLastBorrowTime(System.currentTimeMillis()); - p.borrowNumIncrement(1); - } catch (InterruptedException e) { - logger.error("get proxy error", e); - } - if (proxy == null) { - throw new NoSuchElementException(); - } - return proxy; - } - - public void returnProxy(Proxy proxy, int statusCode) { - TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); - if (p == null) { - return; - } - switch (statusCode) { - case TimerReuseProxy.SUCCESS: - p.setReuseTimeInterval(reuseInterval); - p.setFailedNum(0); - p.setFailedErrorType(new ArrayList()); - p.recordResponse(); - p.successNumIncrement(1); - break; - case TimerReuseProxy.ERROR_403: - // banned,try longer interval - p.fail(TimerReuseProxy.ERROR_403); - p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case TimerReuseProxy.ERROR_BANNED: - p.fail(TimerReuseProxy.ERROR_BANNED); - p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); - logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case TimerReuseProxy.ERROR_404: - // p.fail(Proxy.ERROR_404); - // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - break; - default: - p.fail(statusCode); - break; - } - if (p.getFailedNum() > 20) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { - if (!ProxyUtils.validateProxy(proxy)) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - } - try { - proxyQueue.put(p); - } catch (InterruptedException e) { - logger.warn("proxyQueue return proxy error", e); - } - } - - public String allProxyStatus() { - String re = "all proxy info >>>> \n"; - for (Entry entry : allProxy.entrySet()) { - re += entry.getValue().toString() + "\n"; - } - return re; - } - - public int getIdleNum() { - return proxyQueue.size(); - } - - public int getReuseInterval() { - return reuseInterval; - } - - public void setReuseInterval(int reuseInterval) { - this.reuseInterval = reuseInterval; - } - - public void enable(boolean isEnable) { - this.isEnable = isEnable; - } - - public boolean isEnable() { - return isEnable; - } - - public int getReviveTime() { - return reviveTime; - } - - public void setReviveTime(int reviveTime) { - this.reviveTime = reviveTime; - } - - public boolean isValidateWhenInit() { - return validateWhenInit; - } - - public void validateWhenInit(boolean validateWhenInit) { - this.validateWhenInit = validateWhenInit; - } - - public int getSaveProxyInterval() { - return saveProxyInterval; - } - - public void setSaveProxyInterval(int saveProxyInterval) { - this.saveProxyInterval = saveProxyInterval; - } - - public String getProxyFilePath() { - return proxyFilePath; - } - - public void setProxyFilePath(String proxyFilePath) { - this.proxyFilePath = proxyFilePath; - } +public class TimerReuseProxyPool implements ProxyProvider { + @Override + public void returnProxy(Proxy proxy, boolean banned, Task task) { + + } + + @Override + public Proxy getProxy(Task task) { + return null; + } + +// private Logger logger = LoggerFactory.getLogger(getClass()); +// +// private BlockingQueue proxyQueue = new DelayQueue(); +// private Map allProxy = new ConcurrentHashMap(); +// +// private int reuseInterval = 1500;// ms +// private int reviveTime = 2 * 60 * 60 * 1000;// ms +// private int saveProxyInterval = 10 * 60 * 1000;// ms +// +// private boolean isEnable = false; +// private boolean validateWhenInit = false; +// // private boolean isUseLastProxy = true; +// +// public TimerReuseProxyPool(List httpProxyList) { +// this(httpProxyList, true); +// } +// +// private void addProxy(Map httpProxyMap) { +// isEnable = true; +// for (Entry entry : httpProxyMap.entrySet()) { +// try { +// if (allProxy.containsKey(entry.getKey())) { +// continue; +// } +// if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { +// entry.getValue().setFailedNum(0); +// entry.getValue().setReuseTimeInterval(reuseInterval); +// proxyQueue.add(entry.getValue()); +// allProxy.put(entry.getKey(), entry.getValue()); +// } +// } catch (NumberFormatException e) { +// logger.error("HttpHost init error:", e); +// } +// } +// logger.info("proxy pool size>>>>" + allProxy.size()); +// } +// +// public void addProxy(Proxy... httpProxyList) { +// isEnable = true; +// for (Proxy proxy : httpProxyList) { +// if (!validateWhenInit || ProxyUtils.validateProxy(proxy.getProxyHost())) { +// TimerReuseProxy p = new TimerReuseProxy(proxy.getProxyHost(), proxy.getUsername(), proxy.getPassword(), reuseInterval); +// proxyQueue.add(p); +// allProxy.put(p.getProxyHost().getHost(), p); +// } +// } +// logger.info("proxy pool size>>>>" + allProxy.size()); +// } +// +// public TimerReuseProxy getProxy() { +// TimerReuseProxy proxy = null; +// try { +// Long time = System.currentTimeMillis(); +// proxy = proxyQueue.take(); +// double costTime = (System.currentTimeMillis() - time) / 1000.0; +// if (costTime > reuseInterval) { +// logger.info("get proxy time >>>> " + costTime); +// } +// TimerReuseProxy p = allProxy.get(proxy.getProxyHost().getHost()); +// p.setLastBorrowTime(System.currentTimeMillis()); +// p.borrowNumIncrement(1); +// } catch (InterruptedException e) { +// logger.error("get proxy error", e); +// } +// if (proxy == null) { +// throw new NoSuchElementException(); +// } +// return proxy; +// } +// +// public void returnProxy(Proxy proxy, int statusCode) { +// TimerReuseProxy p = allProxy.get(proxy.getProxyHost()); +// if (p == null) { +// return; +// } +// switch (statusCode) { +// case TimerReuseProxy.SUCCESS: +// p.setReuseTimeInterval(reuseInterval); +// p.setFailedNum(0); +// p.setFailedErrorType(new ArrayList()); +// p.recordResponse(); +// p.successNumIncrement(1); +// break; +// case TimerReuseProxy.ERROR_403: +// // banned,try longer interval +// p.fail(TimerReuseProxy.ERROR_403); +// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); +// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); +// break; +// case TimerReuseProxy.ERROR_BANNED: +// p.fail(TimerReuseProxy.ERROR_BANNED); +// p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); +// logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); +// break; +// case TimerReuseProxy.ERROR_404: +// // p.fail(Proxy.ERROR_404); +// // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); +// break; +// default: +// p.fail(statusCode); +// break; +// } +// if (p.getFailedNum() > 20) { +// p.setReuseTimeInterval(reviveTime); +// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); +// return; +// } +// if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { +// if (!ProxyUtils.validateProxy(proxy)) { +// p.setReuseTimeInterval(reviveTime); +// logger.error("remove proxy >>>> " + proxy + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); +// return; +// } +// } +// try { +// proxyQueue.put(p); +// } catch (InterruptedException e) { +// logger.warn("proxyQueue return proxy error", e); +// } +// } +// +// public String allProxyStatus() { +// String re = "all proxy info >>>> \n"; +// for (Entry entry : allProxy.entrySet()) { +// re += entry.getValue().toString() + "\n"; +// } +// return re; +// } +// +// public int getIdleNum() { +// return proxyQueue.size(); +// } +// +// public int getReuseInterval() { +// return reuseInterval; +// } +// +// public void setReuseInterval(int reuseInterval) { +// this.reuseInterval = reuseInterval; +// } +// +// public void enable(boolean isEnable) { +// this.isEnable = isEnable; +// } +// +// public boolean isEnable() { +// return isEnable; +// } +// +// public int getReviveTime() { +// return reviveTime; +// } +// +// public void setReviveTime(int reviveTime) { +// this.reviveTime = reviveTime; +// } +// +// public boolean isValidateWhenInit() { +// return validateWhenInit; +// } +// +// public void validateWhenInit(boolean validateWhenInit) { +// this.validateWhenInit = validateWhenInit; +// } +// +// public int getSaveProxyInterval() { +// return saveProxyInterval; +// } +// +// public void setSaveProxyInterval(int saveProxyInterval) { +// this.saveProxyInterval = saveProxyInterval; +// } +// +// public String getProxyFilePath() { +// return proxyFilePath; +// } +// +// public void setProxyFilePath(String proxyFilePath) { +// this.proxyFilePath = proxyFilePath; +// } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index f9f9a8c0..9b734c73 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -1,14 +1,12 @@ package us.codecraft.webmagic.utils; -import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.proxy.ProxyHost; +import us.codecraft.webmagic.proxy.Proxy; import java.io.IOException; -import java.net.*; -import java.util.Enumeration; -import java.util.regex.Pattern; +import java.net.InetSocketAddress; +import java.net.Socket; /** * Pooled Proxy Object @@ -18,72 +16,19 @@ import java.util.regex.Pattern; */ public class ProxyUtils { - private static InetAddress localAddr; - private static String networkInterface = "eth7"; private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class); - static { - init(); - } - - private static void init() { - // first way to get local IP - try { - localAddr = InetAddress.getLocalHost(); - logger.info("local IP:" + localAddr.getHostAddress()); - } catch (UnknownHostException e) { - logger.info("try again\n"); - } - if (localAddr != null) { - return; - } - // other way to get local IP - Enumeration localAddrs; - try { - // modify your network interface name - NetworkInterface ni = NetworkInterface.getByName(networkInterface); - if (ni == null) { - return; - } - localAddrs = ni.getInetAddresses(); - if (localAddrs == null || !localAddrs.hasMoreElements()) { - logger.error("choose NetworkInterface\n" + getNetworkInterface()); - return; - } - while (localAddrs.hasMoreElements()) { - InetAddress tmp = localAddrs.nextElement(); - if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) { - localAddr = tmp; - logger.info("local IP:" + localAddr.getHostAddress()); - break; - } - } - } catch (Exception e) { - logger.error("Failure when init ProxyUtil", e); - logger.error("choose NetworkInterface\n" + getNetworkInterface()); - } - } - - public static HttpHost convert(ProxyHost p){ - return new HttpHost(p.getHost(),p.getPort()); - } - public static boolean validateProxy(ProxyHost p) { - if (localAddr == null) { - logger.error("cannot get local IP"); - return false; - } - boolean isReachable = false; + public static boolean validateProxy(Proxy p) { Socket socket = null; try { socket = new Socket(); - socket.bind(new InetSocketAddress(localAddr, 0)); InetSocketAddress endpointSocketAddr = new InetSocketAddress(p.getHost(), p.getPort()); socket.connect(endpointSocketAddr, 3000); - logger.debug("SUCCESS - connection established! Local: " + localAddr.getHostAddress() + " remote: " + p); - isReachable = true; + return true; } catch (IOException e) { - logger.warn("FAILRE - CAN not connect! Local: " + localAddr.getHostAddress() + " remote: " + p); + logger.warn("FAILRE - CAN not connect! remote: " + p); + return false; } finally { if (socket != null) { try { @@ -93,30 +38,7 @@ public class ProxyUtils { } } } - return isReachable; - } - - private static String getNetworkInterface() { - String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils"; - Enumeration enumeration = null; - try { - enumeration = NetworkInterface.getNetworkInterfaces(); - } catch (SocketException e1) { - e1.printStackTrace(); - } - while (enumeration.hasMoreElements()) { - NetworkInterface networkInterface = enumeration.nextElement(); - - Enumeration addr = networkInterface.getInetAddresses(); - while (addr.hasMoreElements()) { - String s = addr.nextElement().getHostAddress(); - Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$"); - if (s != null && IPV4_PATTERN.matcher(s).matches()) { - networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n"; - } - } - } - return networkInterfaceName; } + } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 5440b338..fd1f4c2f 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -5,7 +5,7 @@ import com.github.dreamhead.moco.Runnable; import com.github.dreamhead.moco.Runner; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; @@ -87,12 +87,12 @@ public class HttpClientDownloaderTest { private String getCharsetByUrl(String url) { HttpClientDownloader downloader = new HttpClientDownloader(); Site site = Site.me(); - CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null); + CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); // encoding in http header Content-Type Request requestGBK = new Request(url); CloseableHttpResponse httpResponse = null; try { - httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null,null)); + httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(requestGBK, site, null)); } catch (IOException e) { e.printStackTrace(); } @@ -117,31 +117,32 @@ public class HttpClientDownloaderTest { server.delete(eq(query("q"), "webmagic")).response("delete"); server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head")); server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace"); + final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + final Site site = Site.me(); Runner.running(server, new Runnable() { @Override public void run() throws Exception { - HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); request.setUrl("http://127.0.0.1:12306/search"); request.putParams("q", "webmagic"); request.setMethod(HttpConstant.Method.GET); - RequestBuilder requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("get"); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request,site,null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("get"); request.setMethod(HttpConstant.Method.POST); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("post"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("post"); request.setMethod(HttpConstant.Method.PUT); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("put"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("put"); request.setMethod(HttpConstant.Method.DELETE); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("delete"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("delete"); request.setMethod(HttpConstant.Method.HEAD); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(HttpClients.custom().build().execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(HttpClients.custom().build().execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head"); request.setMethod(HttpConstant.Method.TRACE); - requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl()); - assertThat(EntityUtils.toString(HttpClients.custom().build().execute(requestBuilder.build()).getEntity())).isEqualTo("trace"); + httpUriRequest = httpUriRequestConverter.convert(request, site, null); + assertThat(EntityUtils.toString(HttpClients.custom().build().execute(httpUriRequest).getEntity())).isEqualTo("trace"); } }); } @@ -156,7 +157,7 @@ public class HttpClientDownloaderTest { final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Request request = new Request(); request.setUrl("http://127.0.0.1:12306/"); - Page page = httpClientDownloader.download(request, null); + Page page = httpClientDownloader.download(request, Site.me().toTask()); assertThat(page.getRawText()).isEqualTo("foo"); } }); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 64773236..86af3672 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -2,13 +2,10 @@ package us.codecraft.webmagic.proxy; import org.apache.http.HttpHost; import org.junit.BeforeClass; -import org.junit.Test; import java.util.ArrayList; import java.util.List; -import static org.assertj.core.api.Assertions.assertThat; - /** * @author yxssfxwzy@sina.com May 30, 2014 * @@ -27,30 +24,6 @@ public class ProxyTest { } } - @Test - public void testProxy() { - TimerReuseProxyPool proxyPool = new TimerReuseProxyPool(httpProxyList,false); - proxyPool.setReuseInterval(500); - assertThat(proxyPool.getIdleNum()).isEqualTo(4); - for (int i = 0; i < 2; i++) { - List fetchList = new ArrayList(); - while (proxyPool.getIdleNum() != 0) { - Proxy proxy = proxyPool.getProxy(); - HttpHost httphost = proxy.getHttpHost(); - // httphostList.add(httphost); - System.out.println(httphost.getHostName() + ":" + httphost.getPort()); - Fetch tmp = new Fetch(httphost); - tmp.start(); - fetchList.add(tmp); - } - for (Fetch fetch : fetchList) { - proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS); - } - System.out.println(proxyPool.allProxyStatus()); - - } - } - class Fetch extends Thread { HttpHost hp;