diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java index 2f9b1123..c27292d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/AbstractDownloader.java @@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader { /** * A simple method to download a url. * - * @param url url + * @param url url * @param charset charset * @return html */ @@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader { protected void onSuccess(Request request) { } - protected void onError(Request request, Throwable e) { + protected void onError(Request request) { } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 89b60389..49217e11 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader { return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); - onError(request, e); + onError(request); return page; } finally { if (httpResponse != null) { @@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader { String contentType = httpResponse.getEntity().getContentType() == null ? 
"" : httpResponse.getEntity().getContentType().getValue(); Page page = new Page(); page.setBytes(bytes); - if (!request.isBinaryContent()) { + if (!request.isBinaryContent()){ if (charset == null) { charset = getHtmlCharset(contentType, bytes); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java index 88b8237e..6055bdb0 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/PhantomJSDownloader.java @@ -16,70 +16,73 @@ import java.io.*; * @version 0.5.3 */ public class PhantomJSDownloader extends AbstractDownloader { - private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); + + private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class); private static String crawlJsPath; private static String phantomJsCommand = "phantomjs"; // default + private int retryNum; + private int threadNum; + public PhantomJSDownloader() { this.initPhantomjsCrawlPath(); } - + /** * 添加新的构造函数,支持phantomjs自定义命令 - *
- * example: - * phantomjs.exe 支持windows环境 - * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 - * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException - * + * + * example: + * phantomjs.exe 支持windows环境 + * phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误 + * /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException + * * @param phantomJsCommand phantomJsCommand */ public PhantomJSDownloader(String phantomJsCommand) { this.initPhantomjsCrawlPath(); PhantomJSDownloader.phantomJsCommand = phantomJsCommand; } - + /** * 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js *
* crawl.js start -- - * + * * var system = require('system'); * var url = system.args[1]; - * + * * var page = require('webpage').create(); * page.settings.loadImages = false; * page.settings.resourceTimeout = 5000; - * + * * page.open(url, function (status) { * if (status != 'success') { * console.log("HTTP request failed!"); * } else { * console.log(page.content); * } - * + * * page.close(); * phantom.exit(); * }); - * + * * -- crawl.js end ** 具体项目时可以将以上js代码复制下来使用 - *
+ *
* example:
- * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
- *
+ * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
+ *
* @param phantomJsCommand phantomJsCommand
- * @param crawlJsPath crawlJsPath
+ * @param crawlJsPath crawlJsPath
*/
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
- PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
- PhantomJSDownloader.crawlJsPath = crawlJsPath;
+ PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
+ PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
-
+
private void initPhantomjsCrawlPath() {
- PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
- + System.getProperty("file.separator") + "crawl.js ";
+ PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
}
@Override
@@ -87,41 +90,90 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl());
}
-
- Page page = Page.fail();
- try {
- String content = getPage(request);
- if (!content.contains("HTTP request failed")) {
- page.setDownloadSuccess(true);
- page.setRawText(content);
- page.setUrl(new PlainText(request.getUrl()));
+        String content = getPage(request);
+        if (isFailed(content)) {
+            // Retry up to retryNum more times, stopping at the first usable response.
+            for (int attempt = 1; attempt <= getRetryNum() && isFailed(content); attempt++) {
+                content = getPage(request);
+            }
+            if (isFailed(content)) {
+                // Every attempt failed: notify the error hook and return a page
+                // that carries only the request (no text, URL or status code).
+                onError(request);
+                Page page = new Page();
page.setRequest(request);
- page.setStatusCode(200);
+                return page;
}
- onSuccess(request);
- } catch (Exception e) {
- onError(request, e);
- logger.warn("download page {} error", request.getUrl(), e);
}
+
+        Page page = new Page();
+        page.setRawText(content);
+        page.setUrl(new PlainText(request.getUrl()));
+        page.setRequest(request);
+        page.setStatusCode(200);
+        onSuccess(request);
return page;
}
@Override
public void setThread(int threadNum) {
- // ignore
+        this.threadNum = threadNum;
}
- protected String getPage(Request request) throws Exception {
- String url = request.getUrl();
- Runtime runtime = Runtime.getRuntime();
- Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
- InputStream is = process.getInputStream();
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
- StringBuilder builder = new StringBuilder();
- String line;
- while ((line = br.readLine()) != null) {
- builder.append(line).append("\n");
+    /**
+     * Runs {@code phantomJsCommand crawlJsPath url} and collects everything the
+     * process writes to standard output, appending {@code "\n"} after each line.
+     *
+     * @param request request whose URL is passed to the crawl script
+     * @return the captured output, or {@code null} when starting the process or
+     *         reading/closing its output stream throws an {@link IOException}
+     */
+    protected String getPage(Request request) {
+        StringBuilder output = new StringBuilder();
+        try {
+            // NOTE(review): Runtime.exec(String) splits the command on whitespace, so a
+            // URL or script path containing spaces is broken into several arguments.
+            Process process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + request.getUrl());
+            BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
+            try {
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    output.append(line).append("\n");
+                }
+            } finally {
+                // Close the stream even if reading fails so the pipe is not leaked.
+                reader.close();
+            }
+        } catch (IOException e) {
+            logger.warn("download page {} error", request.getUrl(), e);
+            return null;
}
- return builder.toString();
+        return output.toString();
+    }
+
+    /**
+     * Tells whether {@code content} is missing ({@code null}) or contains the
+     * {@code "HTTP request failed"} marker that download treats as a failed fetch.
+     */
+    private static boolean isFailed(String content) {
+        return content == null || content.contains("HTTP request failed");
+    }
+
+    /**
+     * @return how many extra attempts {@code download} makes after a failed fetch
+     */
+    public int getRetryNum() {
+        return retryNum;
+    }
+
+    /**
+     * Sets how many extra attempts {@code download} makes after a failed fetch.
+     *
+     * @param retryNum number of retries; values below 1 disable retrying
+     * @return this
+     */
+    public PhantomJSDownloader setRetryNum(int retryNum) {
+        this.retryNum = retryNum;
+        return this;
}
}
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
index df601b4f..cce293fc 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.downloader.AbstractDownloader;
+import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
@@ -24,120 +24,112 @@ import java.util.Map;
* 需要下载Selenium driver支持。
*
* @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/
-public class SeleniumDownloader extends AbstractDownloader implements Closeable {
-
- private volatile WebDriverPool webDriverPool;
-
- private Logger logger = LoggerFactory.getLogger(getClass());
-
- private int sleepTime = 0;
-
- private int poolSize = 1;
-
- private static final String DRIVER_PHANTOMJS = "phantomjs";
-
- /**
- * 新建
- *
- * @param chromeDriverPath chromeDriverPath
- */
- public SeleniumDownloader(String chromeDriverPath) {
- System.getProperties().setProperty("webdriver.chrome.driver",
- chromeDriverPath);
- }
-
- /**
- * Constructor without any filed. Construct PhantomJS browser
- *
- * @author bob.li.0718@gmail.com
- */
- public SeleniumDownloader() {
- // System.setProperty("phantomjs.binary.path",
- // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
- }
-
- /**
- * set sleep time to wait until load success
- *
- * @param sleepTime sleepTime
- * @return this
- */
- public SeleniumDownloader setSleepTime(int sleepTime) {
- this.sleepTime = sleepTime;
- return this;
- }
-
- @Override
- public Page download(Request request, Task task) {
- checkInit();
- WebDriver webDriver = null;
- Page page = Page.fail();
- try {
- webDriver = webDriverPool.get();
-
- logger.info("downloading page " + request.getUrl());
- webDriver.get(request.getUrl());
- try {
- if (sleepTime > 0) {
- Thread.sleep(sleepTime);
- }
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- WebDriver.Options manage = webDriver.manage();
- Site site = task.getSite();
- if (site.getCookies() != null) {
- for (Map.Entry