Unify the downloader status handling and pass the error information to onError

pull/1082/head
vio.ao 2 years ago
parent 16221e391d
commit d01f26333b

@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader {
/** /**
* A simple method to download a url. * A simple method to download a url.
* *
* @param url url * @param url url
* @param charset charset * @param charset charset
* @return html * @return html
*/ */
@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader {
protected void onSuccess(Request request) { protected void onSuccess(Request request) {
} }
protected void onError(Request request) { protected void onError(Request request, Throwable e) {
} }
} }

@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader {
return page; return page;
} catch (IOException e) { } catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e); logger.warn("download page {} error", request.getUrl(), e);
onError(request); onError(request, e);
return page; return page;
} finally { } finally {
if (httpResponse != null) { if (httpResponse != null) {
@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader {
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page(); Page page = new Page();
page.setBytes(bytes); page.setBytes(bytes);
if (!request.isBinaryContent()){ if (!request.isBinaryContent()) {
if (charset == null) { if (charset == null) {
charset = getHtmlCharset(contentType, bytes); charset = getHtmlCharset(contentType, bytes);
} }

@ -16,73 +16,70 @@ import java.io.*;
* @version 0.5.3 * @version 0.5.3
*/ */
public class PhantomJSDownloader extends AbstractDownloader { public class PhantomJSDownloader extends AbstractDownloader {
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static String crawlJsPath; private static String crawlJsPath;
private static String phantomJsCommand = "phantomjs"; // default private static String phantomJsCommand = "phantomjs"; // default
private int retryNum;
private int threadNum;
public PhantomJSDownloader() { public PhantomJSDownloader() {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
} }
/** /**
* phantomjs * phantomjs
* * <p>
* example: * example:
* phantomjs.exe windows * phantomjs.exe windows
* phantomjs --ignore-ssl-errors=yes https * phantomjs --ignore-ssl-errors=yes https
* /usr/local/bin/phantomjs IOException * /usr/local/bin/phantomjs IOException
* *
* @param phantomJsCommand phantomJsCommand * @param phantomJsCommand phantomJsCommand
*/ */
public PhantomJSDownloader(String phantomJsCommand) { public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
} }
/** /**
* crawl.jsjarruntime.exec()phantomjs使jarcrawl.js * crawl.jsjarruntime.exec()phantomjs使jarcrawl.js
* <pre> * <pre>
* crawl.js start -- * crawl.js start --
* *
* var system = require('system'); * var system = require('system');
* var url = system.args[1]; * var url = system.args[1];
* *
* var page = require('webpage').create(); * var page = require('webpage').create();
* page.settings.loadImages = false; * page.settings.loadImages = false;
* page.settings.resourceTimeout = 5000; * page.settings.resourceTimeout = 5000;
* *
* page.open(url, function (status) { * page.open(url, function (status) {
* if (status != 'success') { * if (status != 'success') {
* console.log("HTTP request failed!"); * console.log("HTTP request failed!");
* } else { * } else {
* console.log(page.content); * console.log(page.content);
* } * }
* *
* page.close(); * page.close();
* phantom.exit(); * phantom.exit();
* }); * });
* *
* -- crawl.js end * -- crawl.js end
* </pre> * </pre>
* js使 * js使
* * <p>
* example: * example:
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
* *
* @param phantomJsCommand phantomJsCommand * @param phantomJsCommand phantomJsCommand
* @param crawlJsPath crawlJsPath * @param crawlJsPath crawlJsPath
*/ */
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) { public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
PhantomJSDownloader.phantomJsCommand = phantomJsCommand; PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
PhantomJSDownloader.crawlJsPath = crawlJsPath; PhantomJSDownloader.crawlJsPath = crawlJsPath;
} }
private void initPhantomjsCrawlPath() { private void initPhantomjsCrawlPath() {
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ System.getProperty("file.separator") + "crawl.js ";
} }
@Override @Override
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) { if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl()); logger.info("downloading page: " + request.getUrl());
} }
String content = getPage(request);
if (content.contains("HTTP request failed")) { Page page = Page.fail();
for (int i = 1; i <= getRetryNum(); i++) { try {
content = getPage(request); String content = getPage(request);
if (!content.contains("HTTP request failed")) { if (!content.contains("HTTP request failed")) {
break; page.setDownloadSuccess(true);
} page.setRawText(content);
} page.setUrl(new PlainText(request.getUrl()));
if (content.contains("HTTP request failed")) {
//when failed
Page page = new Page();
page.setRequest(request); page.setRequest(request);
return page; page.setStatusCode(200);
} }
onSuccess(request);
} catch (Exception e) {
onError(request, e);
logger.warn("download page {} error", request.getUrl(), e);
} }
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(200);
return page; return page;
} }
@Override @Override
public void setThread(int threadNum) { public void setThread(int threadNum) {
this.threadNum = threadNum; // ignore
} }
protected String getPage(Request request) { protected String getPage(Request request) throws Exception {
try { String url = request.getUrl();
String url = request.getUrl(); Runtime runtime = Runtime.getRuntime();
Runtime runtime = Runtime.getRuntime(); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); InputStream is = process.getInputStream();
InputStream is = process.getInputStream(); BufferedReader br = new BufferedReader(new InputStreamReader(is));
BufferedReader br = new BufferedReader(new InputStreamReader(is)); StringBuilder builder = new StringBuilder();
StringBuffer stringBuffer = new StringBuffer(); String line;
String line; while ((line = br.readLine()) != null) {
while ((line = br.readLine()) != null) { builder.append(line).append("\n");
stringBuffer.append(line).append("\n");
}
return stringBuffer.toString();
} catch (IOException e) {
e.printStackTrace();
} }
return builder.toString();
return null;
}
public int getRetryNum() {
return retryNum;
}
public PhantomJSDownloader setRetryNum(int retryNum) {
this.retryNum = retryNum;
return this;
} }
} }

@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
@ -24,112 +24,120 @@ import java.util.Map;
* Selenium driver<br> * Selenium driver<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br> * Date: 13-7-26 <br>
* Time: 1:37 <br> * Time: 1:37 <br>
*/ */
public class SeleniumDownloader implements Downloader, Closeable { public class SeleniumDownloader extends AbstractDownloader implements Closeable {
private volatile WebDriverPool webDriverPool; private volatile WebDriverPool webDriverPool;
private Logger logger = LoggerFactory.getLogger(getClass()); private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0; private int sleepTime = 0;
private int poolSize = 1; private int poolSize = 1;
private static final String DRIVER_PHANTOMJS = "phantomjs"; private static final String DRIVER_PHANTOMJS = "phantomjs";
/** /**
* *
* *
* @param chromeDriverPath chromeDriverPath * @param chromeDriverPath chromeDriverPath
*/ */
public SeleniumDownloader(String chromeDriverPath) { public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver", System.getProperties().setProperty("webdriver.chrome.driver",
chromeDriverPath); chromeDriverPath);
} }
/** /**
* Constructor without any field. Constructs a PhantomJS browser * Constructor without any field. Constructs a PhantomJS browser
* *
* @author bob.li.0718@gmail.com * @author bob.li.0718@gmail.com
*/ */
public SeleniumDownloader() { public SeleniumDownloader() {
// System.setProperty("phantomjs.binary.path", // System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
} }
/** /**
* set sleep time to wait until load success * set sleep time to wait until load success
* *
* @param sleepTime sleepTime * @param sleepTime sleepTime
* @return this * @return this
*/ */
public SeleniumDownloader setSleepTime(int sleepTime) { public SeleniumDownloader setSleepTime(int sleepTime) {
this.sleepTime = sleepTime; this.sleepTime = sleepTime;
return this; return this;
} }
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
checkInit(); checkInit();
WebDriver webDriver; WebDriver webDriver = null;
try { Page page = Page.fail();
webDriver = webDriverPool.get(); try {
} catch (InterruptedException e) { webDriver = webDriverPool.get();
logger.warn("interrupted", e);
return null; logger.info("downloading page " + request.getUrl());
} webDriver.get(request.getUrl());
logger.info("downloading page " + request.getUrl()); try {
webDriver.get(request.getUrl()); if (sleepTime > 0) {
try { Thread.sleep(sleepTime);
Thread.sleep(sleepTime); }
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
} }
WebDriver.Options manage = webDriver.manage(); WebDriver.Options manage = webDriver.manage();
Site site = task.getSite(); Site site = task.getSite();
if (site.getCookies() != null) { if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies() for (Map.Entry<String, String> cookieEntry : site.getCookies()
.entrySet()) { .entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(), Cookie cookie = new Cookie(cookieEntry.getKey(),
cookieEntry.getValue()); cookieEntry.getValue());
manage.addCookie(cookie); manage.addCookie(cookie);
} }
} }
/* /*
* TODO You can add mouse event or other processes * TODO You can add mouse event or other processes
* *
* @author: bob.li.0718@gmail.com * @author: bob.li.0718@gmail.com
*/ */
WebElement webElement = webDriver.findElement(By.xpath("/html")); WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML"); String content = webElement.getAttribute("outerHTML");
Page page = new Page(); page.setDownloadSuccess(true);
page.setRawText(content); page.setRawText(content);
page.setHtml(new Html(content, request.getUrl())); page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
webDriverPool.returnToPool(webDriver); onSuccess(request);
return page; } catch (Exception e) {
} logger.warn("download page {} error", request.getUrl(), e);
onError(request, e);
private void checkInit() { } finally {
if (webDriverPool == null) { if (webDriver != null) {
synchronized (this) { webDriverPool.returnToPool(webDriver);
webDriverPool = new WebDriverPool(poolSize); }
} }
} return page;
} }
@Override private void checkInit() {
public void setThread(int thread) { if (webDriverPool == null) {
this.poolSize = thread; synchronized (this) {
} webDriverPool = new WebDriverPool(poolSize);
}
@Override }
public void close() throws IOException { }
webDriverPool.closeAll();
} @Override
public void setThread(int thread) {
this.poolSize = thread;
}
@Override
public void close() throws IOException {
webDriverPool.closeAll();
}
} }

Loading…
Cancel
Save