diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d0..2f9b1123 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e11..89b60389 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0..88b8237e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *

+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *

      * crawl.js start --
-     * 
+     *
      *   var system = require('system');
      *   var url = system.args[1];
-     *   
+     *
      *   var page = require('webpage').create();
      *   page.settings.loadImages = false;
      *   page.settings.resourceTimeout = 5000;
-     *   
+     *
      *   page.open(url, function (status) {
      *       if (status != 'success') {
      *           console.log("HTTP request failed!");
      *       } else {
      *           console.log(page.content);
      *       }
-     *   
+     *
      *       page.close();
      *       phantom.exit();
      *   });
-     *   
+     *
      * -- crawl.js end
      * 
* 具体项目时可以将以上js代码复制下来使用 - * + *

* example: - * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); - * + * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); + * * @param phantomJsCommand phantomJsCommand - * @param crawlJsPath crawlJsPath + * @param crawlJsPath crawlJsPath */ public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { - PhantomJSDownloader.phantomJsCommand = phantomJsCommand; - PhantomJSDownloader.crawlJsPath = crawlJsPath; + PhantomJSDownloader.phantomJsCommand = phantomJsCommand; + PhantomJSDownloader.crawlJsPath = crawlJsPath; } - + private void initPhantomjsCrawlPath() { - PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; + PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + + System.getProperty("file.separator") + "crawl.js "; } @Override @@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader { if (logger.isInfoEnabled()) { logger.info("downloading page: " + request.getUrl()); } - String content = getPage(request); - if (content.contains("HTTP request failed")) { - for (int i = 1; i <= getRetryNum(); i++) { - content = getPage(request); - if (!content.contains("HTTP request failed")) { - break; - } - } - if (content.contains("HTTP request failed")) { - //when failed - Page page = new Page(); + + Page page = Page.fail(); + try { + String content = getPage(request); + if (!content.contains("HTTP request failed")) { + page.setDownloadSuccess(true); + page.setRawText(content); + page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); - return page; + page.setStatusCode(200); } + onSuccess(request); + } catch (Exception e) { + onError(request, e); + logger.warn("download page {} error", request.getUrl(), e); } - - Page page = new Page(); - page.setRawText(content); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - page.setStatusCode(200); return page; } @Override public void setThread(int threadNum) { - this.threadNum = threadNum; + // ignore } - protected String getPage(Request request) { - try { - String url = request.getUrl(); - Runtime runtime = Runtime.getRuntime(); - Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); - InputStream is = process.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - StringBuffer stringBuffer = new StringBuffer(); - String line; - while ((line = br.readLine()) != null) { - stringBuffer.append(line).append("\n"); - } - return stringBuffer.toString(); - } catch (IOException e) { - e.printStackTrace(); + protected String getPage(Request request) throws Exception { + String url = request.getUrl(); + Runtime runtime = Runtime.getRuntime(); + Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); + InputStream is = process.getInputStream(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + StringBuilder builder = new StringBuilder(); + String line; + while ((line = br.readLine()) != null) { + builder.append(line).append("\n"); } - - return null; - } - - public int getRetryNum() { - return retryNum; - } - - public PhantomJSDownloader setRetryNum(int retryNum) { - this.retryNum = retryNum; - return this; + return builder.toString(); } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index cce293fc..df601b4f 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.downloader.AbstractDownloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; @@ -24,112 +24,120 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ -public class SeleniumDownloader implements Downloader, Closeable { - - private volatile WebDriverPool webDriverPool; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - private static final String DRIVER_PHANTOMJS = "phantomjs"; - - /** - * 新建 - * - * @param chromeDriverPath chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", - chromeDriverPath); - } - - /** - * Constructor without any filed. Construct PhantomJS browser - * - * @author bob.li.0718@gmail.com - */ - public SeleniumDownloader() { - // System.setProperty("phantomjs.binary.path", - // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies() - .entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), - cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - - /* - * TODO You can add mouse event or other processes - * - * @author: bob.li.0718@gmail.com - */ - - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - page.setRawText(content); - page.setHtml(new Html(content, request.getUrl())); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this) { - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } +public class SeleniumDownloader extends AbstractDownloader implements Closeable { + + private volatile WebDriverPool webDriverPool; + + private Logger logger = LoggerFactory.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver = null; + Page page = Page.fail(); + try { + webDriver = webDriverPool.get(); + + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + if (sleepTime > 0) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + page.setDownloadSuccess(true); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + onSuccess(request); + } catch (Exception e) { + logger.warn("download page {} error", request.getUrl(), e); + onError(request, e); + } finally { + if (webDriver != null) { + webDriverPool.returnToPool(webDriver); + } + } + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } }