diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 6f6453b7..07aad87a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -424,6 +424,8 @@ public class Spider implements Runnable, Task {
pipeline.process(page.getResultItems(), this);
}
}
+ // record the response status code on the request; it is used for proxy status management
+ request.putExtra(Request.STATUS_CODE, page.getStatusCode());
sleep(site.getSleepTime());
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index 27e6b52a..5ae9ffd7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHost;
/**
- * >>>>Proxy Status
+ * >>>> Proxy lifecycle
+
+----------+ +-----+
| last use | | new |
+-----+----+ +---+-+
@@ -44,13 +45,22 @@ import org.apache.http.HttpHost;
| |+-------------------+
+--------+
*/
+
+/**
+ * A proxy object that passes through the lifecycle states shown in the diagram above.
+ *
+ * @author yxssfxwzy@sina.com
+ * @since 0.5.1
+ * @see ProxyPool
+ */
+
public class Proxy implements Delayed, Serializable {
private static final long serialVersionUID = 228939737383625551L;
public static final int ERROR_403 = 403;
public static final int ERROR_404 = 404;
- public static final int ERROR_BANNED = 10000;
- public static final int ERROR_Proxy = 10001;
+ public static final int ERROR_BANNED = 10000;// banned by website
+ public static final int ERROR_Proxy = 10001;// the proxy itself failed
public static final int SUCCESS = 200;
private final HttpHost httpHost;
@@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable {
private Long canReuseTime = 0L;
private Long lastBorrowTime = System.currentTimeMillis();
private Long responseTime = 0L;
- private Long idleTime = 0L;
private int failedNum = 0;
private int successNum = 0;
@@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable {
@Override
public long getDelay(TimeUnit unit) {
- return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS);
+ return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS);
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
index 73c5ed6f..d7ad2756 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java
@@ -1,26 +1,39 @@
package us.codecraft.webmagic.proxy;
-import org.apache.http.HttpHost;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.*;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Timer;
+import java.util.TimerTask;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.DelayQueue;
+import org.apache.http.HttpHost;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import us.codecraft.webmagic.utils.FilePersistentBase;
+import us.codecraft.webmagic.utils.ProxyUtils;
+
/**
- * ClassName:ProxyPool
+ * Pool of {@link Proxy} objects that are borrowed with {@code getProxy()} and handed back with {@code returnProxy(...)}.
*
- * @see
- * @Function: TODO ADD FUNCTION
- * @author ch
- * @version Ver 1.0
- * @Date 2014-2-14 下午01:10:04
+ * @author yxssfxwzy@sina.com
+ * @since 0.5.1
+ * @see Proxy
*/
public class ProxyPool {
@@ -31,10 +44,14 @@ public class ProxyPool {
private int reuseInterval = 1500;// ms
private int reviveTime = 2 * 60 * 60 * 1000;// ms
+ private int saveProxyInterval = 10 * 60 * 1000;// ms
private boolean isEnable = false;
private boolean validateWhenInit = false;
- private String proxyFile = "data/lastUse.proxy";
+ // private boolean isUseLastProxy = true;
+ private String proxyFilePath = "/data/webmagic/lastUse.proxy";
+
+ private FilePersistentBase fBase = new FilePersistentBase();
private Timer timer = new Timer(true);
private TimerTask saveProxyTask = new TimerTask() {
@@ -47,13 +64,46 @@ public class ProxyPool {
};
public ProxyPool() {
-
+ this(null, true);
}
public ProxyPool(List httpProxyList) {
- readProxyList();
- addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
- timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000);
+ this(httpProxyList, true);
+ }
+
+    /**
+     * Creates a pool from the given proxies and, optionally, the proxies persisted by a
+     * previous run.
+     *
+     * @param httpProxyList  proxies to add, each element a {@code String[]} that is handed to
+     *                       {@code addProxy}; may be {@code null} to start without explicit proxies
+     * @param isUseLastProxy when {@code true}, the proxy file location is resolved with
+     *                       {@code setFilePath()}, proxies stored in that file are loaded, and
+     *                       {@code saveProxyTask} is scheduled every {@code saveProxyInterval}
+     *                       milliseconds with no initial delay
+     */
+    public ProxyPool(List httpProxyList, boolean isUseLastProxy) {
+        if (httpProxyList != null) {
+            addProxy(httpProxyList.toArray(new String[httpProxyList.size()][]));
+        }
+        if (isUseLastProxy) {
+            // One unconditional call is enough: the former "if the file does not exist" call was
+            // always followed by a second, identical call, which made the guard dead code.
+            setFilePath();
+            readProxyList();
+            timer.schedule(saveProxyTask, 0, saveProxyInterval);
+        }
+    }
+
+    /**
+     * Points the proxy file at {@code <java.io.tmpdir>/webmagic/lastUse.proxy} and creates the
+     * file when it does not exist yet.
+     * <p>
+     * When the temp directory is unavailable, or the {@code webmagic} directory cannot be
+     * created, an error is logged and {@code proxyFilePath} is left unchanged.
+     */
+    private void setFilePath() {
+        String tmpDir = System.getProperty("java.io.tmpdir");
+        if (tmpDir == null || !new File(tmpDir).isDirectory()) {
+            logger.error("java tmp dir not exists");
+            return;
+        }
+        // Build the path with File so the platform separator is used and it does not matter
+        // whether java.io.tmpdir ends with a separator.
+        File proxyDir = new File(tmpDir, "webmagic");
+        String path = new File(proxyDir, "lastUse.proxy").getPath();
+        // Make sure the parent directory exists before the file is created.
+        if (!proxyDir.isDirectory() && !proxyDir.mkdirs()) {
+            logger.error("proxy file directory create error: " + proxyDir.getPath());
+            return;
+        }
+        fBase.setPath(proxyDir.getPath());
+        File f = fBase.getFile(path);
+        if (!f.exists()) {
+            try {
+                f.createNewFile();
+            } catch (IOException e) {
+                logger.error("proxy file create error", e);
+            }
+        }
+        this.proxyFilePath = path;
     }
private void saveProxyList() {
@@ -61,7 +111,7 @@ public class ProxyPool {
return;
}
try {
- ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile));
+ ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath)));
os.writeObject(prepareForSaving());
os.close();
logger.info("save proxy");
@@ -84,15 +134,15 @@ public class ProxyPool {
+    /**
+     * Loads the proxies saved by a previous run from the proxy file and adds them to the pool.
+     * Loading is best effort: a missing, unreadable or incompatible file is logged and ignored.
+     */
     private void readProxyList() {
         try {
-            ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile));
-            addProxy((Map) is.readObject());
-            is.close();
+            ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath)));
+            try {
+                addProxy((Map) is.readObject());
+            } finally {
+                // close the stream even when deserialization fails
+                is.close();
+            }
         } catch (FileNotFoundException e) {
-            logger.error("proxy file not found", e);
+            logger.info("last use proxy file not found", e);
         } catch (IOException e) {
-            e.printStackTrace();
+            // Reached, for example, for an empty file that has never been written to; logged at
+            // info level (like the missing-file case) rather than discarded silently.
+            logger.info("last use proxy file could not be read", e);
         } catch (ClassNotFoundException e) {
-            e.printStackTrace();
+            logger.info("last use proxy file contains an unknown class", e);
         }
     }
@@ -103,7 +153,7 @@ public class ProxyPool {
if (allProxy.containsKey(entry.getKey())) {
continue;
}
- if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) {
+ if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) {
entry.getValue().setFailedNum(0);
entry.getValue().setReuseTimeInterval(reuseInterval);
proxyQueue.add(entry.getValue());
@@ -124,7 +174,7 @@ public class ProxyPool {
continue;
}
HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1]));
- if (!validateWhenInit || ProxyUtil.validateProxy(item)) {
+ if (!validateWhenInit || ProxyUtils.validateProxy(item)) {
Proxy p = new Proxy(item, reuseInterval);
proxyQueue.add(p);
allProxy.put(s[0], p);
@@ -173,7 +223,7 @@ public class ProxyPool {
p.successNumIncrement(1);
break;
case Proxy.ERROR_403:
- // banned,try larger interval
+ // banned,try longer interval
p.fail(Proxy.ERROR_403);
p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
@@ -185,7 +235,7 @@ public class ProxyPool {
logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0);
break;
case Proxy.ERROR_404:
- //p.fail(Proxy.ERROR_404);
+ // p.fail(Proxy.ERROR_404);
// p.setReuseTimeInterval(reuseInterval * p.getFailedNum());
break;
default:
@@ -193,14 +243,12 @@ public class ProxyPool {
break;
}
if (p.getFailedNum() > 20) {
- // allProxy.remove(host.getAddress().getHostAddress());
p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return;
}
- if (p.getFailedNum()%5==0) {
- if (!ProxyUtil.validateProxy(host)) {
- // allProxy.remove(host.getAddress().getHostAddress());
+ if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) {
+ if (!ProxyUtils.validateProxy(host)) {
p.setReuseTimeInterval(reviveTime);
logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size());
return;
@@ -219,7 +267,6 @@ public class ProxyPool {
re += entry.getValue().toString() + "\n";
}
return re;
-
}
public int getIdleNum() {
@@ -234,57 +281,44 @@ public class ProxyPool {
this.reuseInterval = reuseInterval;
}
- public static List getProxyList() {
- List proxyList = new ArrayList();
- BufferedReader br = null;
- try {
- br = new BufferedReader(new FileReader(new File("proxy.txt")));
+ public void enable(boolean isEnable) {
+ this.isEnable = isEnable;
+ }
- String line = "";
- while ((line = br.readLine()) != null) {
- proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] });
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return proxyList;
+ public boolean isEnable() {
+ return isEnable;
}
- public static void main(String[] args) throws IOException {
- ProxyPool proxyPool = new ProxyPool(getProxyList());
- proxyPool.setReuseInterval(10000);
- // proxyPool.saveProxyList();
-
- while (true) {
- List httphostList = new ArrayList();
- System.in.read();
- int i = 0;
- while (proxyPool.getIdleNum() > 2) {
- HttpHost httphost = proxyPool.getProxy();
- httphostList.add(httphost);
- // proxyPool.proxyPool.use(httphost);
- proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString());
- i++;
- }
- System.out.println(proxyPool.allProxyStatus());
- System.in.read();
- for (i = 0; i < httphostList.size(); i++) {
- proxyPool.returnProxy(httphostList.get(i), 200);
- proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString());
- }
- System.out.println(proxyPool.allProxyStatus());
- System.in.read();
- }
+ public int getReviveTime() {
+ return reviveTime;
+ }
+ public void setReviveTime(int reviveTime) {
+ this.reviveTime = reviveTime;
}
- public void enable(boolean isEnable) {
- this.isEnable = isEnable;
+ public boolean isValidateWhenInit() {
+ return validateWhenInit;
}
- public boolean isEnable() {
- return isEnable;
+ public void validateWhenInit(boolean validateWhenInit) {
+ this.validateWhenInit = validateWhenInit;
+ }
+
+ public int getSaveProxyInterval() {
+ return saveProxyInterval;
}
+
+ public void setSaveProxyInterval(int saveProxyInterval) {
+ this.saveProxyInterval = saveProxyInterval;
+ }
+
+ public String getProxyFilePath() {
+ return proxyFilePath;
+ }
+
+ public void setProxyFilePath(String proxyFilePath) {
+ this.proxyFilePath = proxyFilePath;
+ }
+
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
similarity index 67%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
index f045e0d6..f44c2ac7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java
@@ -1,4 +1,4 @@
-package us.codecraft.webmagic.proxy;
+package us.codecraft.webmagic.utils;
import java.io.IOException;
import java.net.Inet6Address;
@@ -7,36 +7,54 @@ import java.net.InetSocketAddress;
import java.net.NetworkInterface;
import java.net.Socket;
import java.net.SocketException;
+import java.net.UnknownHostException;
import java.util.Enumeration;
+import java.util.regex.Pattern;
import org.apache.http.HttpHost;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * ClassName:ProxyUtil
+ * Utilities for detecting the local IP address and validating proxies.
*
- * @see
- * @author ch
- * @version Ver 1.0
- * @Date 2014-2-16 下午04:20:07
+ * @author yxssfxwzy@sina.com
+ * @since 0.5.1
*/
-public class ProxyUtil {
- // TODO 改为单例
+
+public class ProxyUtils {
private static InetAddress localAddr;
- private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class);
+ private static String networkInterface = "eth7";
+
+ private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class);
static {
init();
}
private static void init() {
+ // first way to get local IP
+ try {
+ localAddr = InetAddress.getLocalHost();
+ logger.info("local IP:" + localAddr.getHostAddress());
+ } catch (UnknownHostException e) {
+ logger.info("try again\n");
+ }
+ if (localAddr != null) {
+ return;
+ }
+ // other way to get local IP
Enumeration localAddrs;
try {
- NetworkInterface ni = NetworkInterface.getByName("eth7");
+ // modify your network interface name
+ NetworkInterface ni = NetworkInterface.getByName(networkInterface);
if (ni == null) {
- logger.error("choose NetworkInterface\n" + getNetworkInterface());
+ return;
}
localAddrs = ni.getInetAddresses();
+ if (localAddrs == null || !localAddrs.hasMoreElements()) {
+ logger.error("choose NetworkInterface\n" + getNetworkInterface());
+ return;
+ }
while (localAddrs.hasMoreElements()) {
InetAddress tmp = localAddrs.nextElement();
if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) {
@@ -49,12 +67,11 @@ public class ProxyUtil {
logger.error("Failure when init ProxyUtil", e);
logger.error("choose NetworkInterface\n" + getNetworkInterface());
}
-
}
public static boolean validateProxy(HttpHost p) {
if (localAddr == null) {
- logger.error("cannot get local ip");
+ logger.error("cannot get local IP");
return false;
}
boolean isReachable = false;
@@ -81,7 +98,8 @@ public class ProxyUtil {
}
private static String getNetworkInterface() {
- String networkInterfaceName = "";
+
+ String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils";
Enumeration enumeration = null;
try {
enumeration = NetworkInterface.getNetworkInterfaces();
@@ -90,10 +108,14 @@ public class ProxyUtil {
}
while (enumeration.hasMoreElements()) {
NetworkInterface networkInterface = enumeration.nextElement();
- networkInterfaceName += networkInterface.toString() + '\n';
+
Enumeration addr = networkInterface.getInetAddresses();
while (addr.hasMoreElements()) {
- networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n";
+ String s = addr.nextElement().getHostAddress();
+ Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$");
+ if (s != null && IPV4_PATTERN.matcher(s).matches()) {
+ networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n";
+ }
}
}
return networkInterfaceName;
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
new file mode 100644
index 00000000..9d3d420d
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java
@@ -0,0 +1,79 @@
+package us.codecraft.webmagic.proxy;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.http.HttpHost;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import us.codecraft.webmagic.Request;
+
+/**
+ * @author yxssfxwzy@sina.com May 30, 2014
+ *
+ */
+public class ProxyTest {
+
+    /** Proxy definitions shared by the tests; every entry is a {host, port} pair. */
+    private static List<String[]> httpProxyList = new ArrayList<String[]>();
+
+    /** Fills {@link #httpProxyList} from a fixed set of "host:port" strings. */
+    @BeforeClass
+    public static void before() {
+        String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" };
+        for (String line : source) {
+            String[] hostAndPort = line.split(":");
+            httpProxyList.add(new String[] { hostAndPort[0], hostAndPort[1] });
+        }
+    }
+
+    /** Every configured proxy must be idle right after construction (no persisted proxies used). */
+    @Test
+    public void testAddProxy() {
+        ProxyPool proxyPool = new ProxyPool(httpProxyList, false);
+        assertThat(proxyPool.getIdleNum()).isEqualTo(4);
+    }
+
+    /**
+     * Borrows every idle proxy, simulates a fetch on each of them and returns them as
+     * successful, twice in a row.
+     */
+    @Test
+    public void testProxy() throws InterruptedException {
+        ProxyPool proxyPool = new ProxyPool(httpProxyList);
+        proxyPool.setReuseInterval(500);
+        assertThat(proxyPool.getIdleNum()).isEqualTo(4);
+        assertThat(new File(proxyPool.getProxyFilePath()).exists()).isTrue();
+        for (int i = 0; i < 2; i++) {
+            List<Fetch> fetchList = new ArrayList<Fetch>();
+            while (proxyPool.getIdleNum() != 0) {
+                HttpHost httphost = proxyPool.getProxy();
+                System.out.println(httphost.getHostName() + ":" + httphost.getPort());
+                Fetch tmp = new Fetch(httphost);
+                tmp.start();
+                fetchList.add(tmp);
+            }
+            for (Fetch fetch : fetchList) {
+                // wait for the simulated fetch to finish before handing the proxy back
+                fetch.join();
+                proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS);
+            }
+            System.out.println(proxyPool.allProxyStatus());
+        }
+    }
+
+    /** Simulates downloading a page through the given proxy by sleeping for a short while. */
+    static class Fetch extends Thread {
+        final HttpHost hp;
+
+        public Fetch(HttpHost hp) {
+            this.hp = hp;
+        }
+
+        @Override
+        public void run() {
+            try {
+                System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort());
+                sleep(500);
+            } catch (InterruptedException e) {
+                // restore the interrupt flag instead of printing and discarding the interruption
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+}