diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index c27292d0..2f9b1123 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request) { + protected void onError(Request request, Throwable e) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 49217e11..89b60389 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request); + onError(request, e); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()){ + if (!request.isBinaryContent()) { if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 6055bdb0..88b8237e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,73 +16,70 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - - private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default - private int retryNum; - private int threadNum; - public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - * - * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + *
+ * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
* crawl.js start -- - * + * * var system = require('system'); * var url = system.args[1]; - * + * * var page = require('webpage').create(); * page.settings.loadImages = false; * page.settings.resourceTimeout = 5000; - * + * * page.open(url, function (status) { * if (status != 'success') { * console.log("HTTP request failed!"); * } else { * console.log(page.content); * } - * + * * page.close(); * phantom.exit(); * }); - * + * * -- crawl.js end ** 具体项目时可以将以上js代码复制下来使用 - * + *
* example:
- * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
- *
+ * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+ *
* @param phantomJsCommand phantomJsCommand
- * @param crawlJsPath crawlJsPath
+ * @param crawlJsPath crawlJsPath
*/
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
- PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
- PhantomJSDownloader.crawlJsPath = crawlJsPath;
+ PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
+ PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
-
+
private void initPhantomjsCrawlPath() {
- PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
+        String classpathRoot = new File(this.getClass().getResource("/").getPath()).getPath(); // directory of the classpath root
+        PhantomJSDownloader.crawlJsPath = classpathRoot + System.getProperty("file.separator") + "crawl.js ";
}
@Override
@@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl());
}
- String content = getPage(request);
- if (content.contains("HTTP request failed")) {
- for (int i = 1; i <= getRetryNum(); i++) {
- content = getPage(request);
- if (!content.contains("HTTP request failed")) {
- break;
- }
- }
- if (content.contains("HTTP request failed")) {
- //when failed
- Page page = new Page();
+
+        Page page = Page.fail(); // stays a failed page unless PhantomJS output is accepted below
+        try {
+            String content = getPage(request);
+            if (!content.contains("HTTP request failed")) {
+                page.setDownloadSuccess(true);
+                page.setRawText(content);
+                page.setUrl(new PlainText(request.getUrl()));
                 page.setRequest(request);
-                return page;
+                page.setStatusCode(200);
+                onSuccess(request); // success hook only when the page was actually rendered
             }
+        } catch (Exception e) {
+            onError(request, e);
+            logger.warn("download page {} error", request.getUrl(), e);
         }
-
- Page page = new Page();
- page.setRawText(content);
- page.setUrl(new PlainText(request.getUrl()));
- page.setRequest(request);
- page.setStatusCode(200);
return page;
}
@Override
public void setThread(int threadNum) {
- this.threadNum = threadNum;
+        // no-op: the thread count is not used; every download launches its own phantomjs process
}
- protected String getPage(Request request) {
- try {
- String url = request.getUrl();
- Runtime runtime = Runtime.getRuntime();
- Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
- InputStream is = process.getInputStream();
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
- StringBuffer stringBuffer = new StringBuffer();
- String line;
- while ((line = br.readLine()) != null) {
- stringBuffer.append(line).append("\n");
- }
- return stringBuffer.toString();
- } catch (IOException e) {
- e.printStackTrace();
+    protected String getPage(Request request) throws Exception {
+        Process process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + request.getUrl());
+        BufferedReader br = new BufferedReader(new InputStreamReader(process.getInputStream()));
+        StringBuilder builder = new StringBuilder();
+        try {
+            for (String line = br.readLine(); line != null; line = br.readLine()) {
+                builder.append(line).append("\n");
+            }
+        } finally {
+            br.close(); // release the child's stdout pipe even when reading fails
         }
-
-        return null;
-    }
-
-    public int getRetryNum() {
-        return retryNum;
-    }
-
-    public PhantomJSDownloader setRetryNum(int retryNum) {
-        this.retryNum = retryNum;
-        return this;
+        return builder.toString();
     }
}
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index cce293fc..df601b4f 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.downloader.Downloader;
+import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
@@ -24,112 +24,120 @@ import java.util.Map;
* 需要下载Selenium driver支持。
*
* @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/
-public class SeleniumDownloader implements Downloader, Closeable {
-
- private volatile WebDriverPool webDriverPool;
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- private int sleepTime = 0;
-
- private int poolSize = 1;
-
- private static final String DRIVER_PHANTOMJS = "phantomjs";
-
- /**
- * 新建
- *
- * @param chromeDriverPath chromeDriverPath
- */
- public SeleniumDownloader(String chromeDriverPath) {
- System.getProperties().setProperty("webdriver.chrome.driver",
- chromeDriverPath);
- }
-
- /**
- * Constructor without any filed. Construct PhantomJS browser
- *
- * @author bob.li.0718@gmail.com
- */
- public SeleniumDownloader() {
- // System.setProperty("phantomjs.binary.path",
- // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
- }
-
- /**
- * set sleep time to wait until load success
- *
- * @param sleepTime sleepTime
- * @return this
- */
- public SeleniumDownloader setSleepTime(int sleepTime) {
- this.sleepTime = sleepTime;
- return this;
- }
-
- @Override
- public Page download(Request request, Task task) {
- checkInit();
- WebDriver webDriver;
- try {
- webDriver = webDriverPool.get();
- } catch (InterruptedException e) {
- logger.warn("interrupted", e);
- return null;
- }
- logger.info("downloading page " + request.getUrl());
- webDriver.get(request.getUrl());
- try {
- Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- WebDriver.Options manage = webDriver.manage();
- Site site = task.getSite();
- if (site.getCookies() != null) {
- for (Map.Entry