git push origin master
Merge branch 'yxssfxwzy-proxy'

pull/157/head
yihua.huang 11 years ago
commit b75e64a61b

@ -424,6 +424,8 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this);
}
}
//for proxy status management
request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime());
}

@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHost;
/**
* >>>>Proxy Status
* >>>> Proxy lifecycle
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
@ -44,13 +45,22 @@ import org.apache.http.HttpHost;
| |+-------------------+
+--------+
*/
/**
* A Proxy object moves through the lifecycle states shown above.<br>
*
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see ProxyPool
*/
public class Proxy implements Delayed, Serializable {
private static final long serialVersionUID = 228939737383625551L;
public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404;
public static final int ERROR_BANNED = 10000;
public static final int ERROR_Proxy = 10001;
public static final int ERROR_BANNED = 10000;// banned by website
public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200;
private final HttpHost httpHost;
@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable {
private Long canReuseTime = 0L;
private Long lastBorrowTime = System.currentTimeMillis();
private Long responseTime = 0L;
private Long idleTime = 0L;
private int failedNum = 0;
private int successNum = 0;
@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable {
@Override
public long getDelay(TimeUnit unit) {
return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS);
return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
}
@Override

@ -1,26 +1,39 @@
package us.codecraft.webmagic.proxy;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Timer;
import java.util.TimerTask;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.utils.FilePersistentBase;
import us.codecraft.webmagic.utils.ProxyUtils;
/**
* ClassName:ProxyPool
* Pooled Proxy Object
*
* @see
* @Function: TODO ADD FUNCTION
* @author ch
* @version Ver 1.0
* @Date 2014-2-14 01:10:04
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
* @see Proxy
*/
public class ProxyPool {
@ -31,10 +44,14 @@ public class ProxyPool {
private int reuseInterval = 1500;// ms
private int reviveTime = 2 * 60 * 60 * 1000;// ms
private int saveProxyInterval = 10 * 60 * 1000;// ms
private boolean isEnable = false;
private boolean validateWhenInit = false;
private String proxyFile = "data/lastUse.proxy";
// private boolean isUseLastProxy = true;
private String proxyFilePath = "/data/webmagic/lastUse.proxy";
private FilePersistentBase fBase = new FilePersistentBase();
private Timer timer = new Timer(true);
private TimerTask saveProxyTask = new TimerTask() {
@ -47,13 +64,46 @@ public class ProxyPool {
};
public ProxyPool() {
this(null, true);
}
public ProxyPool(List<String[]> httpProxyList) {
readProxyList();
addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000);
this(httpProxyList, true);
}
/**
 * Creates a pool, optionally seeded with the given proxies and with the
 * proxies persisted by a previous run.
 *
 * @param httpProxyList  proxies to add, each entry a {@code {host, port}} pair;
 *                       may be {@code null}, in which case nothing is added here
 * @param isUseLastProxy when {@code true}, the persisted proxy file is read and a
 *                       periodic save task is scheduled every
 *                       {@code saveProxyInterval} milliseconds, starting immediately
 */
public ProxyPool(List<String[]> httpProxyList, boolean isUseLastProxy) {
    if (httpProxyList != null) {
        addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
    }
    if (isUseLastProxy) {
        // Fall back to the temp-dir location only when the configured file is
        // missing. The previous code called setFilePath() a second time
        // unconditionally, which made this guard dead and always discarded the
        // configured path.
        if (!new File(proxyFilePath).exists()) {
            setFilePath();
        }
        readProxyList();
        timer.schedule(saveProxyTask, 0, saveProxyInterval);
    }
}
/**
 * Points the pool at {@code <java.io.tmpdir>/webmagic/lastUse.proxy} and creates
 * that file when it does not exist yet.
 *
 * <p>The path is assembled with {@link File} so the platform separator is used
 * and a separator is always present between the temp directory and
 * {@code webmagic}. Plain string concatenation produced paths such as
 * {@code /tmpwebmagic\lastUse.proxy} on non-Windows systems.
 *
 * <p>If the temp directory is unavailable an error is logged and
 * {@code proxyFilePath} is left unchanged, instead of being overwritten with an
 * unusable path.
 */
private void setFilePath() {
    String tmpDir = System.getProperty("java.io.tmpdir");
    if (tmpDir == null || !new File(tmpDir).isDirectory()) {
        logger.error("java tmp dir not exists");
        return;
    }
    File proxyDir = new File(tmpDir, "webmagic");
    String path = new File(proxyDir, "lastUse.proxy").getPath();
    // Make sure the parent directory exists. A failure here surfaces through the
    // createNewFile() error log below.
    proxyDir.mkdirs();
    fBase.setPath(proxyDir.getPath());
    File f = fBase.getFile(path);
    if (!f.exists()) {
        try {
            f.createNewFile();
        } catch (IOException e) {
            logger.error("proxy file create error", e);
        }
    }
    this.proxyFilePath = path;
}
private void saveProxyList() {
@ -61,7 +111,7 @@ public class ProxyPool {
return;
}
try {
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile));
ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
os.writeObject(prepareForSaving());
os.close();
logger.info("save proxy");
@ -84,15 +134,15 @@ public class ProxyPool {
private void readProxyList() {
try {
ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile));
ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
addProxy((Map<String, Proxy>) is.readObject());
is.close();
} catch (FileNotFoundException e) {
logger.error("proxy file not found", e);
logger.info("last use proxy file not found", e);
} catch (IOException e) {
e.printStackTrace();
// e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
// e.printStackTrace();
}
}
@ -103,7 +153,7 @@ public class ProxyPool {
if (allProxy.containsKey(entry.getKey())) {
continue;
}
if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) {
if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
entry.getValue().setFailedNum(0);
entry.getValue().setReuseTimeInterval(reuseInterval);
proxyQueue.add(entry.getValue());
@ -124,7 +174,7 @@ public class ProxyPool {
continue;
}
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
if (!validateWhenInit || ProxyUtil.validateProxy(item)) {
if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval);
proxyQueue.add(p);
allProxy.put(s[0], p);
@ -173,7 +223,7 @@ public class ProxyPool {
p.successNumIncrement(1);
break;
case Proxy.ERROR_403:
// banned,try larger interval
// banned,try longer interval
p.fail(Proxy.ERROR_403);
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
@ -185,7 +235,7 @@ public class ProxyPool {
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break;
case Proxy.ERROR_404:
//p.fail(Proxy.ERROR_404);
// p.fail(Proxy.ERROR_404);
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
break;
default:
@ -193,14 +243,12 @@ public class ProxyPool {
break;
}
if (p.getFailedNum() > 20) {
// allProxy.remove(host.getAddress().getHostAddress());
p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return;
}
if (p.getFailedNum()%5==0) {
if (!ProxyUtil.validateProxy(host)) {
// allProxy.remove(host.getAddress().getHostAddress());
if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
if (!ProxyUtils.validateProxy(host)) {
p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return;
@ -219,7 +267,6 @@ public class ProxyPool {
re += entry.getValue().toString() + "\n";
}
return re;
}
public int getIdleNum() {
@ -234,57 +281,44 @@ public class ProxyPool {
this.reuseInterval = reuseInterval;
}
public static List<String[]> getProxyList() {
List<String[]> proxyList = new ArrayList<String[]>();
BufferedReader br = null;
try {
br = new BufferedReader(new FileReader(new File("proxy.txt")));
public void enable(boolean isEnable) {
this.isEnable = isEnable;
}
String line = "";
while ((line = br.readLine()) != null) {
proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return proxyList;
public boolean isEnable() {
return isEnable;
}
public static void main(String[] args) throws IOException {
ProxyPool proxyPool = new ProxyPool(getProxyList());
proxyPool.setReuseInterval(10000);
// proxyPool.saveProxyList();
while (true) {
List<HttpHost> httphostList = new ArrayList<HttpHost>();
System.in.read();
int i = 0;
while (proxyPool.getIdleNum() > 2) {
HttpHost httphost = proxyPool.getProxy();
httphostList.add(httphost);
// proxyPool.proxyPool.use(httphost);
proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
i++;
}
System.out.println(proxyPool.allProxyStatus());
System.in.read();
for (i = 0; i < httphostList.size(); i++) {
proxyPool.returnProxy(httphostList.get(i), 200);
proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
}
System.out.println(proxyPool.allProxyStatus());
System.in.read();
}
/**
 * Returns the revive time.
 *
 * @return the current {@code reviveTime}; the field is annotated as milliseconds
 */
public int getReviveTime() {
return reviveTime;
}
/**
 * Sets the revive time, which this class assigns as a proxy's reuse interval
 * once that proxy has failed too often.
 *
 * @param reviveTime the new value; the field is annotated as milliseconds
 */
public void setReviveTime(int reviveTime) {
this.reviveTime = reviveTime;
}
public void enable(boolean isEnable) {
this.isEnable = isEnable;
public boolean isValidateWhenInit() {
return validateWhenInit;
}
public boolean isEnable() {
return isEnable;
public void validateWhenInit(boolean validateWhenInit) {
this.validateWhenInit = validateWhenInit;
}
/**
 * Returns the interval used when scheduling the periodic proxy-save task.
 *
 * @return the current {@code saveProxyInterval}; the field is annotated as milliseconds
 */
public int getSaveProxyInterval() {
return saveProxyInterval;
}
/**
 * Sets the interval for the periodic proxy-save task.
 *
 * NOTE(review): the constructor reads this field when it schedules the save
 * task, so a value set afterwards presumably does not reschedule a task that is
 * already running — confirm.
 *
 * @param saveProxyInterval the new value; the field is annotated as milliseconds
 */
public void setSaveProxyInterval(int saveProxyInterval) {
this.saveProxyInterval = saveProxyInterval;
}
/**
 * Returns the path of the file used to persist the proxy list.
 *
 * @return the current {@code proxyFilePath}
 */
public String getProxyFilePath() {
return proxyFilePath;
}
/**
 * Sets the path of the file used to persist the proxy list.
 *
 * NOTE(review): the constructor may already have read from, or replaced, the
 * previous path; this setter only changes the field for later use — confirm
 * against callers.
 *
 * @param proxyFilePath the new file path
 */
public void setProxyFilePath(String proxyFilePath) {
this.proxyFilePath = proxyFilePath;
}
}

@ -1,4 +1,4 @@
package us.codecraft.webmagic.proxy;
package us.codecraft.webmagic.utils;
import java.io.IOException;
import java.net.Inet6Address;
@ -7,36 +7,54 @@ import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.Socket;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.util.Enumeration;
import java.util.regex.Pattern;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* ClassName:ProxyUtil
* Utilities for validating proxies.
*
* @see
* @author ch
* @version Ver 1.0
* @Date 2014-2-16 04:20:07
* @author yxssfxwzy@sina.com <br>
* @since 0.5.1
*/
public class ProxyUtil {
// TODO 改为单例
public class ProxyUtils {
private static InetAddress localAddr;
private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class);
private static String networkInterface = "eth7";
private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
static {
init();
}
private static void init() {
// first way to get local IP
try {
localAddr = InetAddress.getLocalHost();
logger.info("local IP:" + localAddr.getHostAddress());
} catch (UnknownHostException e) {
logger.info("try again\n");
}
if (localAddr != null) {
return;
}
// other way to get local IP
Enumeration<InetAddress> localAddrs;
try {
NetworkInterface ni = NetworkInterface.getByName("eth7");
// modify your network interface name
NetworkInterface ni = NetworkInterface.getByName(networkInterface);
if (ni == null) {
logger.error("choose NetworkInterface\n" + getNetworkInterface());
return;
}
localAddrs = ni.getInetAddresses();
if (localAddrs == null || !localAddrs.hasMoreElements()) {
logger.error("choose NetworkInterface\n" + getNetworkInterface());
return;
}
while (localAddrs.hasMoreElements()) {
InetAddress tmp = localAddrs.nextElement();
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
@ -49,12 +67,11 @@ public class ProxyUtil {
logger.error("Failure when init ProxyUtil", e);
logger.error("choose NetworkInterface\n" + getNetworkInterface());
}
}
public static boolean validateProxy(HttpHost p) {
if (localAddr == null) {
logger.error("cannot get local ip");
logger.error("cannot get local IP");
return false;
}
boolean isReachable = false;
@ -81,7 +98,8 @@ public class ProxyUtil {
}
private static String getNetworkInterface() {
String networkInterfaceName = "";
String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
Enumeration<NetworkInterface> enumeration = null;
try {
enumeration = NetworkInterface.getNetworkInterfaces();
@ -90,10 +108,14 @@ public class ProxyUtil {
}
while (enumeration.hasMoreElements()) {
NetworkInterface networkInterface = enumeration.nextElement();
networkInterfaceName += networkInterface.toString() + '\n';
Enumeration<InetAddress> addr = networkInterface.getInetAddresses();
while (addr.hasMoreElements()) {
networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n";
String s = addr.nextElement().getHostAddress();
Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
if (s != null && IPV4_PATTERN.matcher(s).matches()) {
networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
}
}
}
return networkInterfaceName;

@ -0,0 +1,79 @@
package us.codecraft.webmagic.proxy;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpHost;
import org.junit.BeforeClass;
import org.junit.Test;
import us.codecraft.webmagic.Request;
/**
* @author yxssfxwzy@sina.com May 30, 2014
*
*/
public class ProxyTest {

    /** Proxy definitions shared by the tests, each entry a {host, port} pair. */
    private static List<String[]> httpProxyList = new ArrayList<String[]>();

    /** Fills the shared list with four placeholder host:port entries. */
    @BeforeClass
    public static void before() {
        String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
        for (int idx = 0; idx < source.length; idx++) {
            String[] hostAndPort = source[idx].split(":");
            httpProxyList.add(new String[] { hostAndPort[0], hostAndPort[1] });
        }
    }

    @Test
    public void testAddProxy() {
    }

    /**
     * Borrows every idle proxy from the pool, hands each one to a worker thread,
     * returns them all as successful, and repeats the cycle a second time.
     */
    @Test
    public void testProxy() {
        ProxyPool proxyPool = new ProxyPool(httpProxyList);
        proxyPool.setReuseInterval(500);
        assertThat(proxyPool.getIdleNum()).isEqualTo(4);
        File persistedFile = new File(proxyPool.getProxyFilePath());
        assertThat(persistedFile.exists()).isEqualTo(true);

        int round = 0;
        while (round < 2) {
            List<Fetch> workers = new ArrayList<Fetch>();
            // Drain the pool: one worker per borrowed proxy.
            while (proxyPool.getIdleNum() != 0) {
                HttpHost borrowed = proxyPool.getProxy();
                System.out.println(borrowed.getHostName() + ":" + borrowed.getPort());
                Fetch worker = new Fetch(borrowed);
                worker.start();
                workers.add(worker);
            }
            // Give every borrowed proxy back, reporting success.
            for (int k = 0; k < workers.size(); k++) {
                proxyPool.returnProxy(workers.get(k).hp, Proxy.SUCCESS);
            }
            System.out.println(proxyPool.allProxyStatus());
            round++;
        }
    }

    /** Worker thread that prints the proxy it was given and then sleeps for 500 ms. */
    class Fetch extends Thread {

        HttpHost hp;

        public Fetch(HttpHost hp) {
            this.hp = hp;
        }

        @Override
        public void run() {
            try {
                System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
                sleep(500);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
Loading…
Cancel
Save