diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 6f6453b7..07aad87a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -424,6 +424,8 @@ public class Spider implements Runnable, Task { pipeline.process(page.getResultItems(), this); } } + //for proxy status management + request.putExtra(Request.STATUS_CODE, page.getStatusCode()); sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java index 27e6b52a..5ae9ffd7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java @@ -9,7 +9,8 @@ import java.util.concurrent.TimeUnit; import org.apache.http.HttpHost; /** - * >>>>Proxy Status + * >>>> Proxy lifecycle + +----------+ +-----+ | last use | | new | +-----+----+ +---+-+ @@ -44,13 +45,22 @@ import org.apache.http.HttpHost; | |+-------------------+ +--------+ */ + +/** + * Object has these status of lifecycle above.
+ * + * @author yxssfxwzy@sina.com
+ * @since 0.5.1 + * @see ProxyPool + */ + public class Proxy implements Delayed, Serializable { private static final long serialVersionUID = 228939737383625551L; public static final int ERROR_403 = 403; public static final int ERROR_404 = 404; - public static final int ERROR_BANNED = 10000; - public static final int ERROR_Proxy = 10001; + public static final int ERROR_BANNED = 10000;// banned by website + public static final int ERROR_Proxy = 10001;// the proxy itself failed public static final int SUCCESS = 200; private final HttpHost httpHost; @@ -59,7 +69,6 @@ public class Proxy implements Delayed, Serializable { private Long canReuseTime = 0L; private Long lastBorrowTime = System.currentTimeMillis(); private Long responseTime = 0L; - private Long idleTime = 0L; private int failedNum = 0; private int successNum = 0; @@ -143,7 +152,7 @@ public class Proxy implements Delayed, Serializable { @Override public long getDelay(TimeUnit unit) { - return unit.convert(canReuseTime - System.nanoTime(), unit.NANOSECONDS); + return unit.convert(canReuseTime - System.nanoTime(), TimeUnit.NANOSECONDS); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index 73c5ed6f..d7ad2756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -1,26 +1,39 @@ package us.codecraft.webmagic.proxy; -import org.apache.http.HttpHost; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.net.InetAddress; import java.net.UnknownHostException; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Timer; +import java.util.TimerTask; import java.util.Map.Entry; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.DelayQueue; +import org.apache.http.HttpHost; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import us.codecraft.webmagic.utils.FilePersistentBase; +import us.codecraft.webmagic.utils.ProxyUtils; + /** - * ClassName:ProxyPool + * Pooled Proxy Object * - * @see - * @Function: TODO ADD FUNCTION - * @author ch - * @version Ver 1.0 - * @Date 2014-2-14 下午01:10:04 + * @author yxssfxwzy@sina.com
+ * @since 0.5.1 + * @see Proxy */ public class ProxyPool { @@ -31,10 +44,14 @@ public class ProxyPool { private int reuseInterval = 1500;// ms private int reviveTime = 2 * 60 * 60 * 1000;// ms + private int saveProxyInterval = 10 * 60 * 1000;// ms private boolean isEnable = false; private boolean validateWhenInit = false; - private String proxyFile = "data/lastUse.proxy"; + // private boolean isUseLastProxy = true; + private String proxyFilePath = "/data/webmagic/lastUse.proxy"; + + private FilePersistentBase fBase = new FilePersistentBase(); private Timer timer = new Timer(true); private TimerTask saveProxyTask = new TimerTask() { @@ -47,13 +64,46 @@ public class ProxyPool { }; public ProxyPool() { - + this(null, true); } public ProxyPool(List httpProxyList) { - readProxyList(); - addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); - timer.schedule(saveProxyTask, 10 * 60 * 1000L, 10 * 60 * 1000); + this(httpProxyList, true); + } + + public ProxyPool(List httpProxyList, boolean isUseLastProxy) { + if (httpProxyList != null) { + addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); + } + if (isUseLastProxy) { + if (!new File(proxyFilePath).exists()) { + setFilePath(); + } + setFilePath(); + readProxyList(); + timer.schedule(saveProxyTask, 0, saveProxyInterval); + } + } + + private void setFilePath() { + String tmpDir = System.getProperty("java.io.tmpdir"); + String path = tmpDir + "webmagic\\lastUse.proxy"; + if (tmpDir != null && new File(tmpDir).isDirectory()) { + fBase.setPath(tmpDir + "webmagic"); + File f = fBase.getFile(path); + if (!f.exists()) { + try { + f.createNewFile(); + + } catch (IOException e) { + logger.error("proxy file create error", e); + } + } + + } else { + logger.error("java tmp dir not exists"); + } + this.proxyFilePath = path; } private void saveProxyList() { @@ -61,7 +111,7 @@ public class ProxyPool { return; } try { - ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(proxyFile)); + ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); os.writeObject(prepareForSaving()); os.close(); logger.info("save proxy"); @@ -84,15 +134,15 @@ public class ProxyPool { private void readProxyList() { try { - ObjectInputStream is = new ObjectInputStream(new FileInputStream(proxyFile)); + ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); addProxy((Map) is.readObject()); is.close(); } catch (FileNotFoundException e) { - logger.error("proxy file not found", e); + logger.info("last use proxy file not found", e); } catch (IOException e) { - e.printStackTrace(); + // e.printStackTrace(); } catch (ClassNotFoundException e) { - e.printStackTrace(); + // e.printStackTrace(); } } @@ -103,7 +153,7 @@ public class ProxyPool { if (allProxy.containsKey(entry.getKey())) { continue; } - if (!validateWhenInit || ProxyUtil.validateProxy(entry.getValue().getHttpHost())) { + if (!validateWhenInit || ProxyUtils.validateProxy(entry.getValue().getHttpHost())) { entry.getValue().setFailedNum(0); entry.getValue().setReuseTimeInterval(reuseInterval); proxyQueue.add(entry.getValue()); @@ -124,7 +174,7 @@ public class ProxyPool { continue; } HttpHost item = new HttpHost(InetAddress.getByName(s[0]), Integer.valueOf(s[1])); - if (!validateWhenInit || ProxyUtil.validateProxy(item)) { + if (!validateWhenInit || ProxyUtils.validateProxy(item)) { Proxy p = new Proxy(item, reuseInterval); proxyQueue.add(p); allProxy.put(s[0], p); @@ -173,7 +223,7 @@ public class ProxyPool { p.successNumIncrement(1); break; case Proxy.ERROR_403: - // banned,try larger interval + // banned,try longer interval p.fail(Proxy.ERROR_403); p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); @@ -185,7 +235,7 @@ public class ProxyPool { logger.info(host + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case Proxy.ERROR_404: - //p.fail(Proxy.ERROR_404); + // p.fail(Proxy.ERROR_404); // p.setReuseTimeInterval(reuseInterval * p.getFailedNum()); break; default: @@ -193,14 +243,12 @@ public class ProxyPool { break; } if (p.getFailedNum() > 20) { - // allProxy.remove(host.getAddress().getHostAddress()); p.setReuseTimeInterval(reviveTime); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; } - if (p.getFailedNum()%5==0) { - if (!ProxyUtil.validateProxy(host)) { - // allProxy.remove(host.getAddress().getHostAddress()); + if (p.getFailedNum() > 0 && p.getFailedNum() % 5 == 0) { + if (!ProxyUtils.validateProxy(host)) { p.setReuseTimeInterval(reviveTime); logger.error("remove proxy >>>> " + host + ">>>>" + p.getFailedType() + " >>>> remain proxy >>>> " + proxyQueue.size()); return; @@ -219,7 +267,6 @@ public class ProxyPool { re += entry.getValue().toString() + "\n"; } return re; - } public int getIdleNum() { @@ -234,57 +281,44 @@ public class ProxyPool { this.reuseInterval = reuseInterval; } - public static List getProxyList() { - List proxyList = new ArrayList(); - BufferedReader br = null; - try { - br = new BufferedReader(new FileReader(new File("proxy.txt"))); + public void enable(boolean isEnable) { + this.isEnable = isEnable; + } - String line = ""; - while ((line = br.readLine()) != null) { - proxyList.add(new String[] { line.split(":")[0], line.split(":")[1] }); - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - return proxyList; + public boolean isEnable() { + return isEnable; } - public static void main(String[] args) throws IOException { - ProxyPool proxyPool = new ProxyPool(getProxyList()); - proxyPool.setReuseInterval(10000); - // proxyPool.saveProxyList(); - - while (true) { - List httphostList = new ArrayList(); - System.in.read(); - int i = 0; - while (proxyPool.getIdleNum() > 2) { - HttpHost httphost = proxyPool.getProxy(); - httphostList.add(httphost); - // proxyPool.proxyPool.use(httphost); - proxyPool.logger.info("borrow object>>>>" + i + ">>>>" + httphostList.get(i).toString()); - i++; - } - System.out.println(proxyPool.allProxyStatus()); - System.in.read(); - for (i = 0; i < httphostList.size(); i++) { - proxyPool.returnProxy(httphostList.get(i), 200); - proxyPool.logger.info("return object>>>>" + i + ">>>>" + httphostList.get(i).toString()); - } - System.out.println(proxyPool.allProxyStatus()); - System.in.read(); - } + public int getReviveTime() { + return reviveTime; + } + public void setReviveTime(int reviveTime) { + this.reviveTime = reviveTime; } - public void enable(boolean isEnable) { - this.isEnable = isEnable; + public boolean isValidateWhenInit() { + return validateWhenInit; } - public boolean isEnable() { - return isEnable; + public void validateWhenInit(boolean validateWhenInit) { + this.validateWhenInit = validateWhenInit; + } + + public int getSaveProxyInterval() { + return saveProxyInterval; } + + public void setSaveProxyInterval(int saveProxyInterval) { + this.saveProxyInterval = saveProxyInterval; + } + + public String getProxyFilePath() { + return proxyFilePath; + } + + public void setProxyFilePath(String proxyFilePath) { + this.proxyFilePath = proxyFilePath; + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java similarity index 67% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java index f045e0d6..f44c2ac7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyUtil.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ProxyUtils.java @@ -1,4 +1,4 @@ -package us.codecraft.webmagic.proxy; +package us.codecraft.webmagic.utils; import java.io.IOException; import java.net.Inet6Address; @@ -7,36 +7,54 @@ import java.net.InetSocketAddress; import java.net.NetworkInterface; import java.net.Socket; import java.net.SocketException; +import java.net.UnknownHostException; import java.util.Enumeration; +import java.util.regex.Pattern; import org.apache.http.HttpHost; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * ClassName:ProxyUtil + * Pooled Proxy Object * - * @see - * @author ch - * @version Ver 1.0 - * @Date 2014-2-16 下午04:20:07 + * @author yxssfxwzy@sina.com
+ * @since 0.5.1 */ -public class ProxyUtil { - // TODO 改为单例 + +public class ProxyUtils { private static InetAddress localAddr; - private static final Logger logger = LoggerFactory.getLogger(ProxyUtil.class); + private static String networkInterface = "eth7"; + + private static final Logger logger = LoggerFactory.getLogger(ProxyUtils.class); static { init(); } private static void init() { + // first way to get local IP + try { + localAddr = InetAddress.getLocalHost(); + logger.info("local IP:" + localAddr.getHostAddress()); + } catch (UnknownHostException e) { + logger.info("try again\n"); + } + if (localAddr != null) { + return; + } + // other way to get local IP Enumeration localAddrs; try { - NetworkInterface ni = NetworkInterface.getByName("eth7"); + // modify your network interface name + NetworkInterface ni = NetworkInterface.getByName(networkInterface); if (ni == null) { - logger.error("choose NetworkInterface\n" + getNetworkInterface()); + return; } localAddrs = ni.getInetAddresses(); + if (localAddrs == null || !localAddrs.hasMoreElements()) { + logger.error("choose NetworkInterface\n" + getNetworkInterface()); + return; + } while (localAddrs.hasMoreElements()) { InetAddress tmp = localAddrs.nextElement(); if (!tmp.isLoopbackAddress() && !tmp.isLinkLocalAddress() && !(tmp instanceof Inet6Address)) { @@ -49,12 +67,11 @@ public class ProxyUtil { logger.error("Failure when init ProxyUtil", e); logger.error("choose NetworkInterface\n" + getNetworkInterface()); } - } public static boolean validateProxy(HttpHost p) { if (localAddr == null) { - logger.error("cannot get local ip"); + logger.error("cannot get local IP"); return false; } boolean isReachable = false; @@ -81,7 +98,8 @@ public class ProxyUtil { } private static String getNetworkInterface() { - String networkInterfaceName = ""; + + String networkInterfaceName = ">>>> modify networkInterface in us.codecraft.webmagic.utils.ProxyUtils"; Enumeration enumeration = null; try { enumeration = NetworkInterface.getNetworkInterfaces(); @@ -90,10 +108,14 @@ public class ProxyUtil { } while (enumeration.hasMoreElements()) { NetworkInterface networkInterface = enumeration.nextElement(); - networkInterfaceName += networkInterface.toString() + '\n'; + Enumeration addr = networkInterface.getInetAddresses(); while (addr.hasMoreElements()) { - networkInterfaceName += "\tip:" + addr.nextElement().getHostAddress() + "\n"; + String s = addr.nextElement().getHostAddress(); + Pattern IPV4_PATTERN = Pattern.compile("^(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)(\\.(25[0-5]|2[0-4]\\d|[0-1]?\\d?\\d)){3}$"); + if (s != null && IPV4_PATTERN.matcher(s).matches()) { + networkInterfaceName += networkInterface.toString() + "IP:" + s + "\n\n"; + } } } return networkInterfaceName; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java new file mode 100644 index 00000000..9d3d420d --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/ProxyTest.java @@ -0,0 +1,79 @@ +package us.codecraft.webmagic.proxy; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import org.apache.http.HttpHost; +import org.junit.BeforeClass; +import org.junit.Test; + +import us.codecraft.webmagic.Request; + +/** + * @author yxssfxwzy@sina.com May 30, 2014 + * + */ +public class ProxyTest { + + private static List httpProxyList = new ArrayList(); + + @BeforeClass + public static void before() { + // String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", + // "0.0.0.4:0" }; + String[] source = { "0.0.0.1:0", "0.0.0.2:0", "0.0.0.3:0", "0.0.0.4:0" }; + for (String line : source) { + httpProxyList.add(new String[] { line.split(":")[0], line.split(":")[1] }); + } + } + + @Test + public void testAddProxy() { + + } + + @Test + public void testProxy() { + ProxyPool proxyPool = new ProxyPool(httpProxyList); + proxyPool.setReuseInterval(500); + assertThat(proxyPool.getIdleNum()).isEqualTo(4); + assertThat(new File(proxyPool.getProxyFilePath()).exists()).isEqualTo(true); + for (int i = 0; i < 2; i++) { + List fetchList = new ArrayList(); + while (proxyPool.getIdleNum() != 0) { + HttpHost httphost = proxyPool.getProxy(); + // httphostList.add(httphost); + System.out.println(httphost.getHostName() + ":" + httphost.getPort()); + Fetch tmp = new Fetch(httphost); + tmp.start(); + fetchList.add(tmp); + } + for (Fetch fetch : fetchList) { + proxyPool.returnProxy(fetch.hp, Proxy.SUCCESS); + } + System.out.println(proxyPool.allProxyStatus()); + + } + } + + class Fetch extends Thread { + HttpHost hp; + + public Fetch(HttpHost hp) { + this.hp = hp; + } + + @Override + public void run() { + try { + System.out.println("fetch web page use proxy: " + hp.getHostName() + ":" + hp.getPort()); + sleep(500); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } +}