diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java new file mode 100644 index 00000000..db17de2b --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java @@ -0,0 +1,13 @@ +package us.codecraft.webmagic.proxy; + +import org.apache.http.HttpResponse; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/20 + * Time: 下午10:52 + */ +public interface BannedChecker { + + boolean isBanned(HttpResponse httpResponse); +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java index ad307a6f..fcc1f8df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java @@ -7,7 +7,7 @@ import us.codecraft.webmagic.Task; */ public interface ProxyPool { - void returnProxy(Proxy proxy, int statusCode, Task task); + void returnProxy(Proxy proxy, boolean banned, Task task); Proxy getProxy(Task task); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java index a336c71d..6fde6047 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/TimerReuseProxyPool.java @@ -34,102 +34,11 @@ public class TimerReuseProxyPool implements ProxyPool { private boolean isEnable = false; private boolean validateWhenInit = false; // private boolean isUseLastProxy = true; - private String proxyFilePath = "/data/webmagic/lastUse.proxy"; - - private FilePersistentBase fBase = new FilePersistentBase(); - - private Timer timer = new Timer(true); - private TimerTask saveProxyTask = new TimerTask() { - - @Override - public void run() { - saveProxyList(); - logger.info(allProxyStatus()); - } - }; - - public TimerReuseProxyPool() { - this(null, true); - } - + public TimerReuseProxyPool(List httpProxyList) { this(httpProxyList, true); } - public TimerReuseProxyPool(List httpProxyList, boolean isUseLastProxy) { - if (httpProxyList != null) { - addProxy(httpProxyList.toArray(new String[httpProxyList.size()][])); - } - if (isUseLastProxy) { - if (!new File(proxyFilePath).exists()) { - setFilePath(); - } - readProxyList(); - timer.schedule(saveProxyTask, 0, saveProxyInterval); - } - } - - private void setFilePath() { - String tmpDir = System.getProperty("java.io.tmpdir"); - String path = tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic" + FilePersistentBase.PATH_SEPERATOR + "lastUse.proxy"; - if (tmpDir != null && new File(tmpDir).isDirectory()) { - fBase.setPath(tmpDir + FilePersistentBase.PATH_SEPERATOR + "webmagic"); - File f = fBase.getFile(path); - if (!f.exists()) { - try { - f.createNewFile(); - - } catch (IOException e) { - logger.error("proxy file create error", e); - } - } - - } else { - logger.error("java tmp dir not exists"); - } - this.proxyFilePath = path; - } - - private void saveProxyList() { - if (allProxy.size() == 0) { - return; - } - try { - ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(fBase.getFile(proxyFilePath))); - os.writeObject(prepareForSaving()); - os.close(); - logger.info("save proxy"); - } catch (FileNotFoundException e) { - logger.error("proxy file not found", e); - } catch (IOException e) { - e.printStackTrace(); - } - } - - private Map prepareForSaving() { - Map tmp = new HashMap(); - for (Entry e : allProxy.entrySet()) { - TimerReuseProxy p = e.getValue(); - p.setFailedNum(0); - tmp.put(e.getKey(), p); - } - return tmp; - } - - private void readProxyList() { - try { - ObjectInputStream is = new ObjectInputStream(new FileInputStream(fBase.getFile(proxyFilePath))); - addProxy((Map) is.readObject()); - is.close(); - } catch (FileNotFoundException e) { - logger.info("last use proxy file not found", e); - } catch (IOException e) { - // e.printStackTrace(); - } catch (ClassNotFoundException e) { - // e.printStackTrace(); - } - } - private void addProxy(Map httpProxyMap) { isEnable = true; for (Entry entry : httpProxyMap.entrySet()) { @@ -205,7 +114,6 @@ public class TimerReuseProxyPool implements ProxyPool { case TimerReuseProxy.ERROR_BANNED: p.fail(TimerReuseProxy.ERROR_BANNED); p.setReuseTimeInterval(10 * 60 * 1000 * p.getFailedNum()); - logger.warn("this proxy is banned >>>> " + p.getHttpHost()); logger.info(proxy + " >>>> reuseTimeInterval is >>>> " + p.getReuseTimeInterval() / 1000.0); break; case TimerReuseProxy.ERROR_404: