Unify the downloader status handling and pass error information to onError

pull/1082/head
vio.ao 2 years ago
parent 16221e391d
commit d01f26333b

@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader {
protected void onSuccess(Request request) { protected void onSuccess(Request request) {
} }
protected void onError(Request request) { protected void onError(Request request, Throwable e) {
} }
} }

@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader {
return page; return page;
} catch (IOException e) { } catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e); logger.warn("download page {} error", request.getUrl(), e);
onError(request); onError(request, e);
return page; return page;
} finally { } finally {
if (httpResponse != null) { if (httpResponse != null) {
@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader {
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(); String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page(); Page page = new Page();
page.setBytes(bytes); page.setBytes(bytes);
if (!request.isBinaryContent()){ if (!request.isBinaryContent()) {
if (charset == null) { if (charset == null) {
charset = getHtmlCharset(contentType, bytes); charset = getHtmlCharset(contentType, bytes);
} }

@ -16,21 +16,17 @@ import java.io.*;
* @version 0.5.3 * @version 0.5.3
*/ */
public class PhantomJSDownloader extends AbstractDownloader { public class PhantomJSDownloader extends AbstractDownloader {
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static String crawlJsPath; private static String crawlJsPath;
private static String phantomJsCommand = "phantomjs"; // default private static String phantomJsCommand = "phantomjs"; // default
private int retryNum;
private int threadNum;
public PhantomJSDownloader() { public PhantomJSDownloader() {
this.initPhantomjsCrawlPath(); this.initPhantomjsCrawlPath();
} }
/** /**
* phantomjs * phantomjs
* * <p>
* example: * example:
* phantomjs.exe windows * phantomjs.exe windows
* phantomjs --ignore-ssl-errors=yes https * phantomjs --ignore-ssl-errors=yes https
@ -69,7 +65,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
* -- crawl.js end * -- crawl.js end
* </pre> * </pre>
* js使 * js使
* * <p>
* example: * example:
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js"); * new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
* *
@ -82,7 +78,8 @@ public class PhantomJSDownloader extends AbstractDownloader {
} }
private void initPhantomjsCrawlPath() { private void initPhantomjsCrawlPath() {
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js "; PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ System.getProperty("file.separator") + "crawl.js ";
} }
@Override @Override
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) { if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl()); logger.info("downloading page: " + request.getUrl());
} }
Page page = Page.fail();
try {
String content = getPage(request); String content = getPage(request);
if (content.contains("HTTP request failed")) {
for (int i = 1; i <= getRetryNum(); i++) {
content = getPage(request);
if (!content.contains("HTTP request failed")) { if (!content.contains("HTTP request failed")) {
break; page.setDownloadSuccess(true);
}
}
if (content.contains("HTTP request failed")) {
//when failed
Page page = new Page();
page.setRequest(request);
return page;
}
}
Page page = new Page();
page.setRawText(content); page.setRawText(content);
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(200); page.setStatusCode(200);
}
onSuccess(request);
} catch (Exception e) {
onError(request, e);
logger.warn("download page {} error", request.getUrl(), e);
}
return page; return page;
} }
@Override @Override
public void setThread(int threadNum) { public void setThread(int threadNum) {
this.threadNum = threadNum; // ignore
} }
protected String getPage(Request request) { protected String getPage(Request request) throws Exception {
try {
String url = request.getUrl(); String url = request.getUrl();
Runtime runtime = Runtime.getRuntime(); Runtime runtime = Runtime.getRuntime();
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url); Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
InputStream is = process.getInputStream(); InputStream is = process.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is)); BufferedReader br = new BufferedReader(new InputStreamReader(is));
StringBuffer stringBuffer = new StringBuffer(); StringBuilder builder = new StringBuilder();
String line; String line;
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
stringBuffer.append(line).append("\n"); builder.append(line).append("\n");
}
return stringBuffer.toString();
} catch (IOException e) {
e.printStackTrace();
} }
return builder.toString();
return null;
}
public int getRetryNum() {
return retryNum;
}
public PhantomJSDownloader setRetryNum(int retryNum) {
this.retryNum = retryNum;
return this;
} }
} }

@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
@ -27,7 +27,7 @@ import java.util.Map;
* Date: 13-7-26 <br> * Date: 13-7-26 <br>
* Time: 1:37 <br> * Time: 1:37 <br>
*/ */
public class SeleniumDownloader implements Downloader, Closeable { public class SeleniumDownloader extends AbstractDownloader implements Closeable {
private volatile WebDriverPool webDriverPool; private volatile WebDriverPool webDriverPool;
@ -73,17 +73,17 @@ public class SeleniumDownloader implements Downloader, Closeable {
@Override @Override
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
checkInit(); checkInit();
WebDriver webDriver; WebDriver webDriver = null;
Page page = Page.fail();
try { try {
webDriver = webDriverPool.get(); webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
}
logger.info("downloading page " + request.getUrl()); logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl()); webDriver.get(request.getUrl());
try { try {
if (sleepTime > 0) {
Thread.sleep(sleepTime); Thread.sleep(sleepTime);
}
} catch (InterruptedException e) { } catch (InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -106,12 +106,20 @@ public class SeleniumDownloader implements Downloader, Closeable {
WebElement webElement = webDriver.findElement(By.xpath("/html")); WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML"); String content = webElement.getAttribute("outerHTML");
Page page = new Page(); page.setDownloadSuccess(true);
page.setRawText(content); page.setRawText(content);
page.setHtml(new Html(content, request.getUrl())); page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
onSuccess(request);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, e);
} finally {
if (webDriver != null) {
webDriverPool.returnToPool(webDriver); webDriverPool.returnToPool(webDriver);
}
}
return page; return page;
} }

Loading…
Cancel
Save