From b3a282e58d63e98ca19ed8eedac0b3681761d4cf Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 10 Jun 2014 00:05:30 +0800 Subject: [PATCH] some fix for tests #130 --- .../main/java/us/codecraft/webmagic/Site.java | 7 +- .../downloader/HttpClientDownloader.java | 4 +- .../codecraft/webmagic/proxy/ProxyPool.java | 599 +++++++++--------- .../codecraft/webmagic/proxy/ProxyTest.java | 15 +- 4 files changed, 305 insertions(+), 320 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 01a4c75a..3d383330 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -49,7 +49,7 @@ public class Site { private HttpHost httpProxy; - private ProxyPool httpProxyPool=new ProxyPool(); + private ProxyPool httpProxyPool; private boolean useGzip = true; @@ -453,6 +453,11 @@ public class Site { return this; } + public Site enableHttpProxyPool() { + this.httpProxyPool=new ProxyPool(); + return this; + } + public ProxyPool getHttpProxyPool() { return httpProxyPool; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index bdafea79..1891ef53 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -141,8 +141,8 @@ public class HttpClientDownloader extends AbstractDownloader { .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) .setCookieSpec(CookieSpecs.BEST_MATCH); - if (site.getHttpProxyPool().isEnable()) { - HttpHost host = site.getHttpProxyFromPool(); + if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) { + HttpHost host = site.getHttpProxyFromPool(); requestConfigBuilder.setProxy(host); request.putExtra(Request.PROXY, host); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index d7ad2756..3996ffb0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -1,324 +1,311 @@ package us.codecraft.webmagic.proxy; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; +import org.apache.http.HttpHost; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import us.codecraft.webmagic.utils.FilePersistentBase; +import us.codecraft.webmagic.utils.ProxyUtils; + +import java.io.*; import java.net.InetAddress; import java.net.UnknownHostException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.Timer; -import java.util.TimerTask; +import java.util.*; import java.util.Map.Entry; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.DelayQueue; -import org.apache.http.HttpHost; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import us.codecraft.webmagic.utils.FilePersistentBase; -import us.codecraft.webmagic.utils.ProxyUtils; - /** * Pooled Proxy Object - * + * * @author yxssfxwzy@sina.com
- * @since 0.5.1 * @see Proxy + * @since 0.5.1 */ public class ProxyPool { - private Logger logger = LoggerFactory.getLogger(getClass()); - - private BlockingQueue proxyQueue = new DelayQueue(); - private Map allProxy = new ConcurrentHashMap(); - - private int reuseInterval = 1500;// ms - private int reviveTime = 2 * 60 * 60 * 1000;// ms - private int saveProxyInterval = 10 * 60 * 1000;// ms - - private boolean isEnable = false; - private boolean validateWhenInit = false; - // private boolean isUseLastProxy = true; - private String proxyFilePath = "/data/webmagic/lastUse.proxy"; - - private FilePersistentBase fBase = new FilePersistentBase(); - - private Timer timer = new Timer(true); - private TimerTask saveProxyTask = new TimerTask() { - - @Override - public void run() { - saveProxyList(); - logger.info(allProxyStatus()); - } - }; - - public ProxyPool() { - this(null, true); - } - - public ProxyPool(List httpProxyList) { - this(httpProxyList, true); - } - - public ProxyPool(List httpProxyList, boolean isUseLastProxy) { - if (httpProxyList != null) { - addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); - } - if (isUseLastProxy) { - if (!new File(proxyFilePath).exists()) { - setFilePath(); - } - setFilePath(); - readProxyList(); - timer.schedule(saveProxyTask, 0, saveProxyInterval); - } - } - - private void setFilePath() { - String tmpDir = System.getProperty("java.io.tmpdir"); - String path = tmpDir + "webmagic\\lastUse.proxy"; - if (tmpDir != null && new File(tmpDir).isDirectory()) { - fBase.setPath(tmpDir + "webmagic"); - File f = fBase.getFile(path); - if (!f.exists()) { - try { - f.createNewFile(); - - } catch (IOException e) { - logger.error("proxy file create error", e); - } - } - - } else { - logger.error("java tmp dir not exists"); - } - this.proxyFilePath = path; - } - - private void saveProxyList() { - if (allProxy.size() == 0) { - return; - } - try { - ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); - os.writeObject(prepareForSaving()); - os.close(); - logger.info("save proxy"); - } catch (FileNotFoundException e) { - logger.error("proxy file not found", e); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private Map prepareForSaving() { - Map tmp = new HashMap(); - for (Entry e : allProxy.entrySet()) { - Proxy p = e.getValue(); - p.setFailedNum(0); - tmp.put(e.getKey(), p); - } - return tmp; - } - - private void readProxyList() { - try { - ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); - addProxy((Map) is.readObject()); - is.close(); - } catch (FileNotFoundException e) { - logger.info("last use proxy file not found", e); - } catch (IOException e) { - // e.printStackTrace(); - } catch (ClassNotFoundException e) { - // e.printStackTrace(); - } - } - - private void addProxy(Map httpProxyMap) { - isEnable = true; - for (Entry entry : httpProxyMap.entrySet()) { - try { - if (allProxy.containsKey(entry.getKey())) { - continue; - } - if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { - entry.getValue().setFailedNum(0); - entry.getValue().setReuseTimeInterval(reuseInterval); - proxyQueue.add(entry.getValue()); - allProxy.put(entry.getKey(), entry.getValue()); - } - } catch (NumberFormatException e) { - logger.error("HttpHost init error:", e); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); - } - - public void addProxy(String[]... httpProxyList) { - isEnable = true; - for (String[] s : httpProxyList) { - try { - if (allProxy.containsKey(s[0])) { - continue; - } - HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); - if (!validateWhenInit || ProxyUtils.validateProxy(item)) { - Proxy p = new Proxy(item, reuseInterval); - proxyQueue.add(p); - allProxy.put(s[0], p); - } - } catch (NumberFormatException e) { - logger.error("HttpHost init error:", e); - } catch (UnknownHostException e) { - logger.error("HttpHost init error:", e); - } - } - logger.info("proxy pool size>>>>" + allProxy.size()); - } - - public HttpHost getProxy() { - Proxy proxy = null; - try { - Long time = System.currentTimeMillis(); - proxy = proxyQueue.take(); - double costTime = (System.currentTimeMillis() - time) / 1000.0; - if (costTime > reuseInterval) { - logger.info("get proxy time >>>> " + costTime); - } - Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); - p.setLastBorrowTime(System.currentTimeMillis()); - p.borrowNumIncrement(1); - } catch (InterruptedException e) { - logger.error("get proxy error", e); - } - if (proxy == null) { - throw new NoSuchElementException(); - } - return proxy.getHttpHost(); - } - - public void returnProxy(HttpHost host, int statusCode) { - Proxy p = allProxy.get(host.getAddress().getHostAddress()); - if (p == null) { - return; - } - switch (statusCode) { - case Proxy.SUCCESS: - p.setReuseTimeInterval(reuseInterval); - p.setFailedNum(0); - p.setFailedErrorType(new ArrayList()); - p.recordResponse(); - p.successNumIncrement(1); - break; - case Proxy.ERROR_403: - // banned,try longer interval - p.fail(Proxy.ERROR_403); - p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case Proxy.ERROR_BANNED: - p.fail(Proxy.ERROR_BANNED); - p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); - logger.warn("this proxy is banned >>>> " + p.getHttpHost()); - logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); - break; - case Proxy.ERROR_404: - // p.fail(Proxy.ERROR_404); - // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); - break; - default: - p.fail(statusCode); - break; - } - if (p.getFailedNum() > 20) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { - if (!ProxyUtils.validateProxy(host)) { - p.setReuseTimeInterval(reviveTime); - logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); - return; - } - } - try { - proxyQueue.put(p); - } catch (InterruptedException e) { - logger.warn("proxyQueue return proxy error", e); - } - } - - public String allProxyStatus() { - String re = "all proxy info >>>> \n"; - for (Entry entry : allProxy.entrySet()) { - re += entry.getValue().toString() + "\n"; - } - return re; - } - - public int getIdleNum() { - return proxyQueue.size(); - } - - public int getReuseInterval() { - return reuseInterval; - } - - public void setReuseInterval(int reuseInterval) { - this.reuseInterval = reuseInterval; - } - - public void enable(boolean isEnable) { - this.isEnable = isEnable; - } - - public boolean isEnable() { - return isEnable; - } - - public int getReviveTime() { - return reviveTime; - } - - public void setReviveTime(int reviveTime) { - this.reviveTime = reviveTime; - } - - public boolean isValidateWhenInit() { - return validateWhenInit; - } - - public void validateWhenInit(boolean validateWhenInit) { - this.validateWhenInit = validateWhenInit; - } - - public int getSaveProxyInterval() { - return saveProxyInterval; - } - - public void setSaveProxyInterval(int saveProxyInterval) { - this.saveProxyInterval = saveProxyInterval; - } - - public String getProxyFilePath() { - return proxyFilePath; - } - - public void setProxyFilePath(String proxyFilePath) { - this.proxyFilePath = proxyFilePath; - } + private Logger logger = LoggerFactory.getLogger(getClass()); + + private BlockingQueue proxyQueue = new DelayQueue(); + private Map allProxy = new ConcurrentHashMap(); + + private int reuseInterval = 1500;// ms + private int reviveTime = 2 * 60 * 60 * 1000;// ms + private int saveProxyInterval = 10 * 60 * 1000;// ms + + private boolean isEnable = false; + private boolean validateWhenInit = false; + // private boolean isUseLastProxy = true; + private String proxyFilePath = "/data/webmagic/lastUse.proxy"; + + private FilePersistentBase fBase = new FilePersistentBase(); + + private Timer timer = new Timer(true); + private TimerTask saveProxyTask = new TimerTask() { + + @Override + public void run() { + saveProxyList(); + logger.info(allProxyStatus()); + } + }; + + public ProxyPool() { + this(null, true); + } + + public ProxyPool(List httpProxyList) { + this(httpProxyList, true); + } + + public ProxyPool(List httpProxyList, boolean isUseLastProxy) { + if (httpProxyList != null) { + addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); + } + if (isUseLastProxy) { + if (!new File(proxyFilePath).exists()) { + setFilePath(); + } + setFilePath(); + readProxyList(); + timer.schedule(saveProxyTask, 0, saveProxyInterval); + } + } + + private void setFilePath() { + String tmpDir = System.getProperty("java.io.tmpdir"); + String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy"; + if (tmpDir != null && new File(tmpDir).isDirectory()) { + fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic"); + File f = fBase.getFile(path); + if (!f.exists()) { + try { + f.createNewFile(); + + } catch (IOException e) { + logger.error("proxy file create error", e); + } + } + + } else { + logger.error("java tmp dir not exists"); + } + this.proxyFilePath = path; + } + + private void saveProxyList() { + if (allProxy.size() == 0) { + return; + } + try { + ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); + os.writeObject(prepareForSaving()); + os.close(); + logger.info("save proxy"); + } catch (FileNotFoundException e) { + logger.error("proxy file not found", e); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private Map prepareForSaving() { + Map tmp = new HashMap(); + for (Entry e : allProxy.entrySet()) { + Proxy p = e.getValue(); + p.setFailedNum(0); + tmp.put(e.getKey(), p); + } + return tmp; + } + + private void readProxyList() { + try { + ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); + addProxy((Map) is.readObject()); + is.close(); + } catch (FileNotFoundException e) { + logger.info("last use proxy file not found", e); + } catch (IOException e) { + // e.printStackTrace(); + } catch (ClassNotFoundException e) { + // e.printStackTrace(); + } + } + + private void addProxy(Map httpProxyMap) { + isEnable = true; + for (Entry entry : httpProxyMap.entrySet()) { + try { + if (allProxy.containsKey(entry.getKey())) { + continue; + } + if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { + entry.getValue().setFailedNum(0); + entry.getValue().setReuseTimeInterval(reuseInterval); + proxyQueue.add(entry.getValue()); + allProxy.put(entry.getKey(), entry.getValue()); + } + } catch (NumberFormatException e) { + logger.error("HttpHost init error:", e); + } + } + logger.info("proxy pool size>>>>" + allProxy.size()); + } + + public void addProxy(String[]... httpProxyList) { + isEnable = true; + for (String[] s : httpProxyList) { + try { + if (allProxy.containsKey(s[0])) { + continue; + } + HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); + if (!validateWhenInit || ProxyUtils.validateProxy(item)) { + Proxy p = new Proxy(item, reuseInterval); + proxyQueue.add(p); + allProxy.put(s[0], p); + } + } catch (NumberFormatException e) { + logger.error("HttpHost init error:", e); + } catch (UnknownHostException e) { + logger.error("HttpHost init error:", e); + } + } + logger.info("proxy pool size>>>>" + allProxy.size()); + } + + public HttpHost getProxy() { + Proxy proxy = null; + try { + Long time = System.currentTimeMillis(); + proxy = proxyQueue.take(); + double costTime = (System.currentTimeMillis() - time) / 1000.0; + if (costTime > reuseInterval) { + logger.info("get proxy time >>>> " + costTime); + } + Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); + p.setLastBorrowTime(System.currentTimeMillis()); + p.borrowNumIncrement(1); + } catch (InterruptedException e) { + logger.error("get proxy error", e); + } + if (proxy == null) { + throw new NoSuchElementException(); + } + return proxy.getHttpHost(); + } + + public void returnProxy(HttpHost host, int statusCode) { + Proxy p = allProxy.get(host.getAddress().getHostAddress()); + if (p == null) { + return; + } + switch (statusCode) { + case Proxy.SUCCESS: + p.setReuseTimeInterval(reuseInterval); + p.setFailedNum(0); + p.setFailedErrorType(new ArrayList()); + p.recordResponse(); + p.successNumIncrement(1); + break; + case Proxy.ERROR_403: + // banned,try longer interval + p.fail(Proxy.ERROR_403); + p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); + logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + break; + case Proxy.ERROR_BANNED: + p.fail(Proxy.ERROR_BANNED); + p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); + logger.warn("this proxy is banned >>>> " + p.getHttpHost()); + logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); + break; + case Proxy.ERROR_404: + // p.fail(Proxy.ERROR_404); + // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); + break; + default: + p.fail(statusCode); + break; + } + if (p.getFailedNum() > 20) { + p.setReuseTimeInterval(reviveTime); + logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + return; + } + if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { + if (!ProxyUtils.validateProxy(host)) { + p.setReuseTimeInterval(reviveTime); + logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); + return; + } + } + try { + proxyQueue.put(p); + } catch (InterruptedException e) { + logger.warn("proxyQueue return proxy error", e); + } + } + + public String allProxyStatus() { + String re = "all proxy info >>>> \n"; + for (Entry entry : allProxy.entrySet()) { + re += entry.getValue().toString() + "\n"; + } + return re; + } + + public int getIdleNum() { + return proxyQueue.size(); + } + + public int getReuseInterval() { + return reuseInterval; + } + + public void setReuseInterval(int reuseInterval) { + this.reuseInterval = reuseInterval; + } + + public void enable(boolean isEnable) { + this.isEnable = isEnable; + } + + public boolean isEnable() { + return isEnable; + } + + public int getReviveTime() { + return reviveTime; + } + + public void setReviveTime(int reviveTime) { + this.reviveTime = reviveTime; + } + + public boolean isValidateWhenInit() { + return validateWhenInit; + } + + public void validateWhenInit(boolean validateWhenInit) { + this.validateWhenInit = validateWhenInit; + } + + public int getSaveProxyInterval() { + return saveProxyInterval; + } + + public void setSaveProxyInterval(int saveProxyInterval) { + this.saveProxyInterval = saveProxyInterval; + } + + public String getProxyFilePath() { + return proxyFilePath; + } + + public void setProxyFilePath(String proxyFilePath) { + this.proxyFilePath = proxyFilePath; + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java index 9d3d420d..fd5827fe 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -1,16 +1,14 @@ package us.codecraft.webmagic.proxy; -import static org.assertj.core.api.Assertions.assertThat; +import org.apache.http.HttpHost; +import org.junit.BeforeClass; +import org.junit.Test; import java.io.File; import java.util.ArrayList; import java.util.List; -import org.apache.http.HttpHost; -import org.junit.BeforeClass; -import org.junit.Test; - -import us.codecraft.webmagic.Request; +import static org.assertj.core.api.Assertions.assertThat; /** * @author yxssfxwzy@sina.com May 30, 2014 @@ -30,11 +28,6 @@ public class ProxyTest { } } - @Test - public void testAddProxy() { - - } - @Test public void testProxy() { ProxyPool proxyPool = new ProxyPool(httpProxyList);