some fix for tests #130

pull/157/head
yihua.huang 11 years ago
parent b75e64a61b
commit b3a282e58d

@ -49,7 +49,7 @@ public class Site {
private HttpHost httpProxy; private HttpHost httpProxy;
private ProxyPool httpProxyPool=new ProxyPool(); private ProxyPool httpProxyPool;
private boolean useGzip = true; private boolean useGzip = true;
@ -453,6 +453,11 @@ public class Site {
return this; return this;
} }
/**
 * Turns on proxy pooling for this site by creating a new {@link ProxyPool}
 * with its no-argument constructor and storing it in {@code httpProxyPool},
 * replacing whatever pool was stored before.
 *
 * @return this site, so calls can be chained
 */
public Site enableHttpProxyPool() {
    ProxyPool freshPool = new ProxyPool();
    httpProxyPool = freshPool;
    return this;
}
public ProxyPool getHttpProxyPool() { public ProxyPool getHttpProxyPool() {
return httpProxyPool; return httpProxyPool;
} }

@ -141,8 +141,8 @@ public class HttpClientDownloader extends AbstractDownloader {
.setSocketTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH); .setCookieSpec(CookieSpecs.BEST_MATCH);
if (site.getHttpProxyPool().isEnable()) { if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
HttpHost host = site.getHttpProxyFromPool(); HttpHost host = site.getHttpProxyFromPool();
requestConfigBuilder.setProxy(host); requestConfigBuilder.setProxy(host);
request.putExtra(Request.PROXY, host); request.putExtra(Request.PROXY, host);
} }

@ -1,324 +1,311 @@
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.proxy;
import java.io.File; import org.apache.http.HttpHost;
import java.io.FileInputStream; import org.slf4j.Logger;
import java.io.FileNotFoundException; import org.slf4j.LoggerFactory;
import java.io.FileOutputStream; import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.IOException; import us.codecraft.webmagic.utils.ProxyUtils;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream; import java.io.*;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.ArrayList; import java.util.*;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Timer;
import java.util.TimerTask;
import java.util.Map.Entry; import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue; import java.util.concurrent.DelayQueue;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.ProxyUtils;
/** /**
* Pooled Proxy Object * Pooled Proxy Object
* *
* @author yxssfxwzy@sina.com <br> * @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see Proxy * @see Proxy
* @since 0.5.1
*/ */
public class ProxyPool { public class ProxyPool {
private Logger logger = LoggerFactory.getLogger(getClass()); private Logger logger = LoggerFactory.getLogger(getClass());
private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>(); private BlockingQueue<Proxy> proxyQueue = new DelayQueue<Proxy>();
private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>(); private Map<String, Proxy> allProxy = new ConcurrentHashMap<String, Proxy>();
private int reuseInterval = 1500;// ms private int reuseInterval = 1500;// ms
private int reviveTime = 2 * 60 * 60 * 1000;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms
private int saveProxyInterval = 10 * 60 * 1000;// ms private int saveProxyInterval = 10 * 60 * 1000;// ms
private boolean isEnable = false; private boolean isEnable = false;
private boolean validateWhenInit = false; private boolean validateWhenInit = false;
// private boolean isUseLastProxy = true; // private boolean isUseLastProxy = true;
private String proxyFilePath = "/data/webmagic/lastUse.proxy"; private String proxyFilePath = "/data/webmagic/lastUse.proxy";
private FilePersistentBase fBase = new FilePersistentBase(); private FilePersistentBase fBase = new FilePersistentBase();
private Timer timer = new Timer(true); private Timer timer = new Timer(true);
private TimerTask saveProxyTask = new TimerTask() { private TimerTask saveProxyTask = new TimerTask() {
@Override @Override
public void run() { public void run() {
saveProxyList(); saveProxyList();
logger.info(allProxyStatus()); logger.info(allProxyStatus());
} }
}; };
public ProxyPool() { public ProxyPool() {
this(null, true); this(null, true);
} }
public ProxyPool(List<String[]> httpProxyList) { public ProxyPool(List<String[]> httpProxyList) {
this(httpProxyList, true); this(httpProxyList, true);
} }
public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) { public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
if (httpProxyList != null) { if (httpProxyList != null) {
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
} }
if (isUseLastProxy) { if (isUseLastProxy) {
if (!new File(proxyFilePath).exists()) { if (!new File(proxyFilePath).exists()) {
setFilePath(); setFilePath();
} }
setFilePath(); setFilePath();
readProxyList(); readProxyList();
timer.schedule(saveProxyTask, 0, saveProxyInterval); timer.schedule(saveProxyTask, 0, saveProxyInterval);
} }
} }
private void setFilePath() { private void setFilePath() {
String tmpDir = System.getProperty("java.io.tmpdir"); String tmpDir = System.getProperty("java.io.tmpdir");
String path = tmpDir + "webmagic\\lastUse.proxy"; String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy";
if (tmpDir != null && new File(tmpDir).isDirectory()) { if (tmpDir != null && new File(tmpDir).isDirectory()) {
fBase.setPath(tmpDir + "webmagic"); fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic");
File f = fBase.getFile(path); File f = fBase.getFile(path);
if (!f.exists()) { if (!f.exists()) {
try { try {
f.createNewFile(); f.createNewFile();
} catch (IOException e) { } catch (IOException e) {
logger.error("proxy file create error", e); logger.error("proxy file create error", e);
} }
} }
} else { } else {
logger.error("java tmp dir not exists"); logger.error("java tmp dir not exists");
} }
this.proxyFilePath = path; this.proxyFilePath = path;
} }
private void saveProxyList() { private void saveProxyList() {
if (allProxy.size() == 0) { if (allProxy.size() == 0) {
return; return;
} }
try { try {
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
os.writeObject(prepareForSaving()); os.writeObject(prepareForSaving());
os.close(); os.close();
logger.info("save proxy"); logger.info("save proxy");
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
logger.error("proxy file not found", e); logger.error("proxy file not found", e);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
private Map<String, Proxy> prepareForSaving() { private Map<String, Proxy> prepareForSaving() {
Map<String, Proxy> tmp = new HashMap<String, Proxy>(); Map<String, Proxy> tmp = new HashMap<String, Proxy>();
for (Entry<String, Proxy> e : allProxy.entrySet()) { for (Entry<String, Proxy> e : allProxy.entrySet()) {
Proxy p = e.getValue(); Proxy p = e.getValue();
p.setFailedNum(0); p.setFailedNum(0);
tmp.put(e.getKey(), p); tmp.put(e.getKey(), p);
} }
return tmp; return tmp;
} }
private void readProxyList() { private void readProxyList() {
try { try {
ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
addProxy((Map<String, Proxy>) is.readObject()); addProxy((Map<String, Proxy>) is.readObject());
is.close(); is.close();
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
logger.info("last use proxy file not found", e); logger.info("last use proxy file not found", e);
} catch (IOException e) { } catch (IOException e) {
// e.printStackTrace(); // e.printStackTrace();
} catch (ClassNotFoundException e) { } catch (ClassNotFoundException e) {
// e.printStackTrace(); // e.printStackTrace();
} }
} }
private void addProxy(Map<String, Proxy> httpProxyMap) { private void addProxy(Map<String, Proxy> httpProxyMap) {
isEnable = true; isEnable = true;
for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) { for (Entry<String, Proxy> entry : httpProxyMap.entrySet()) {
try { try {
if (allProxy.containsKey(entry.getKey())) { if (allProxy.containsKey(entry.getKey())) {
continue; continue;
} }
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
entry.getValue().setFailedNum(0); entry.getValue().setFailedNum(0);
entry.getValue().setReuseTimeInterval(reuseInterval); entry.getValue().setReuseTimeInterval(reuseInterval);
proxyQueue.add(entry.getValue()); proxyQueue.add(entry.getValue());
allProxy.put(entry.getKey(), entry.getValue()); allProxy.put(entry.getKey(), entry.getValue());
} }
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
logger.error("HttpHost init error:", e); logger.error("HttpHost init error:", e);
} }
} }
logger.info("proxy pool size>>>>" + allProxy.size()); logger.info("proxy pool size>>>>" + allProxy.size());
} }
public void addProxy(String[]... httpProxyList) { public void addProxy(String[]... httpProxyList) {
isEnable = true; isEnable = true;
for (String[] s : httpProxyList) { for (String[] s : httpProxyList) {
try { try {
if (allProxy.containsKey(s[0])) { if (allProxy.containsKey(s[0])) {
continue; continue;
} }
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
if (!validateWhenInit || ProxyUtils.validateProxy(item)) { if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval); Proxy p = new Proxy(item, reuseInterval);
proxyQueue.add(p); proxyQueue.add(p);
allProxy.put(s[0], p); allProxy.put(s[0], p);
} }
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
logger.error("HttpHost init error:", e); logger.error("HttpHost init error:", e);
} catch (UnknownHostException e) { } catch (UnknownHostException e) {
logger.error("HttpHost init error:", e); logger.error("HttpHost init error:", e);
} }
} }
logger.info("proxy pool size>>>>" + allProxy.size()); logger.info("proxy pool size>>>>" + allProxy.size());
} }
public HttpHost getProxy() { public HttpHost getProxy() {
Proxy proxy = null; Proxy proxy = null;
try { try {
Long time = System.currentTimeMillis(); Long time = System.currentTimeMillis();
proxy = proxyQueue.take(); proxy = proxyQueue.take();
double costTime = (System.currentTimeMillis() - time) / 1000.0; double costTime = (System.currentTimeMillis() - time) / 1000.0;
if (costTime > reuseInterval) { if (costTime > reuseInterval) {
logger.info("get proxy time >>>> " + costTime); logger.info("get proxy time >>>> " + costTime);
} }
Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress()); Proxy p = allProxy.get(proxy.getHttpHost().getAddress().getHostAddress());
p.setLastBorrowTime(System.currentTimeMillis()); p.setLastBorrowTime(System.currentTimeMillis());
p.borrowNumIncrement(1); p.borrowNumIncrement(1);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.error("get proxy error", e); logger.error("get proxy error", e);
} }
if (proxy == null) { if (proxy == null) {
throw new NoSuchElementException(); throw new NoSuchElementException();
} }
return proxy.getHttpHost(); return proxy.getHttpHost();
} }
public void returnProxy(HttpHost host, int statusCode) { public void returnProxy(HttpHost host, int statusCode) {
Proxy p = allProxy.get(host.getAddress().getHostAddress()); Proxy p = allProxy.get(host.getAddress().getHostAddress());
if (p == null) { if (p == null) {
return; return;
} }
switch (statusCode) { switch (statusCode) {
case Proxy.SUCCESS: case Proxy.SUCCESS:
p.setReuseTimeInterval(reuseInterval); p.setReuseTimeInterval(reuseInterval);
p.setFailedNum(0); p.setFailedNum(0);
p.setFailedErrorType(new ArrayList<Integer>()); p.setFailedErrorType(new ArrayList<Integer>());
p.recordResponse(); p.recordResponse();
p.successNumIncrement(1); p.successNumIncrement(1);
break; break;
case Proxy.ERROR_403: case Proxy.ERROR_403:
// banned,try longer interval // banned,try longer interval
p.fail(Proxy.ERROR_403); p.fail(Proxy.ERROR_403);
p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break; break;
case Proxy.ERROR_BANNED: case Proxy.ERROR_BANNED:
p.fail(Proxy.ERROR_BANNED); p.fail(Proxy.ERROR_BANNED);
p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum());
logger.warn("this proxy is banned >>>> " + p.getHttpHost()); logger.warn("this proxy is banned >>>> " + p.getHttpHost());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break; break;
case Proxy.ERROR_404: case Proxy.ERROR_404:
// p.fail(Proxy.ERROR_404); // p.fail(Proxy.ERROR_404);
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); // p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
break; break;
default: default:
p.fail(statusCode); p.fail(statusCode);
break; break;
} }
if (p.getFailedNum() > 20) { if (p.getFailedNum() > 20) {
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
} }
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
if (!ProxyUtils.validateProxy(host)) { if (!ProxyUtils.validateProxy(host)) {
p.setReuseTimeInterval(reviveTime); p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return; return;
} }
} }
try { try {
proxyQueue.put(p); proxyQueue.put(p);
} catch (InterruptedException e) { } catch (InterruptedException e) {
logger.warn("proxyQueue return proxy error", e); logger.warn("proxyQueue return proxy error", e);
} }
} }
public String allProxyStatus() { public String allProxyStatus() {
String re = "all proxy info >>>> \n"; String re = "all proxy info >>>> \n";
for (Entry<String, Proxy> entry : allProxy.entrySet()) { for (Entry<String, Proxy> entry : allProxy.entrySet()) {
re += entry.getValue().toString() + "\n"; re += entry.getValue().toString() + "\n";
} }
return re; return re;
} }
public int getIdleNum() { public int getIdleNum() {
return proxyQueue.size(); return proxyQueue.size();
} }
public int getReuseInterval() { public int getReuseInterval() {
return reuseInterval; return reuseInterval;
} }
public void setReuseInterval(int reuseInterval) { public void setReuseInterval(int reuseInterval) {
this.reuseInterval = reuseInterval; this.reuseInterval = reuseInterval;
} }
public void enable(boolean isEnable) { public void enable(boolean isEnable) {
this.isEnable = isEnable; this.isEnable = isEnable;
} }
public boolean isEnable() { public boolean isEnable() {
return isEnable; return isEnable;
} }
public int getReviveTime() { public int getReviveTime() {
return reviveTime; return reviveTime;
} }
public void setReviveTime(int reviveTime) { public void setReviveTime(int reviveTime) {
this.reviveTime = reviveTime; this.reviveTime = reviveTime;
} }
public boolean isValidateWhenInit() { public boolean isValidateWhenInit() {
return validateWhenInit; return validateWhenInit;
} }
public void validateWhenInit(boolean validateWhenInit) { public void validateWhenInit(boolean validateWhenInit) {
this.validateWhenInit = validateWhenInit; this.validateWhenInit = validateWhenInit;
} }
public int getSaveProxyInterval() { public int getSaveProxyInterval() {
return saveProxyInterval; return saveProxyInterval;
} }
public void setSaveProxyInterval(int saveProxyInterval) { public void setSaveProxyInterval(int saveProxyInterval) {
this.saveProxyInterval = saveProxyInterval; this.saveProxyInterval = saveProxyInterval;
} }
public String getProxyFilePath() { public String getProxyFilePath() {
return proxyFilePath; return proxyFilePath;
} }
public void setProxyFilePath(String proxyFilePath) { public void setProxyFilePath(String proxyFilePath) {
this.proxyFilePath = proxyFilePath; this.proxyFilePath = proxyFilePath;
} }
} }

@ -1,16 +1,14 @@
package us.codecraft.webmagic.proxy; package us.codecraft.webmagic.proxy;
import static org.assertj.core.api.Assertions.assertThat; import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.http.HttpHost; import static org.assertj.core.api.Assertions.assertThat;
import org.junit.BeforeClass;
import org.junit.Test;
import us.codecraft.webmagic.Request;
/** /**
* @author yxssfxwzy@sina.com May 30, 2014 * @author yxssfxwzy@sina.com May 30, 2014
@ -30,11 +28,6 @@ public class ProxyTest {
} }
} }
@Test
public void testAddProxy() {
}
@Test @Test
public void testProxy() { public void testProxy() {
ProxyPool proxyPool = new ProxyPool(httpProxyList); ProxyPool proxyPool = new ProxyPool(httpProxyList);

Loading…
Cancel
Save