Revert "Common the downloader status process and pass error information when …"

pull/1083/head
Sutra Zhou 2 years ago committed by GitHub
parent ee5a0585d7
commit acfbd7b883

@@ -26,7 +26,7 @@ public abstract class AbstractDownloader implements Downloader {
/**
* A simple method to download a url.
*
* @param url url
* @param url url
* @param charset charset
* @return html
*/
@@ -38,7 +38,7 @@ public abstract class AbstractDownloader implements Downloader {
protected void onSuccess(Request request) {
}
protected void onError(Request request, Throwable e) {
protected void onError(Request request) {
}
}
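For context, a minimal hypothetical subclass (not part of this commit; the class name is invented) showing the two onError shapes this revert switches between:

```java
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.downloader.HttpClientDownloader;

// Hypothetical subclass; only one of the two overrides compiles,
// depending on whether the revert is applied.
public class AlertingDownloader extends HttpClientDownloader {

    // Pre-revert hook: the cause is passed through, so it can be forwarded
    // to monitoring without re-parsing the downloader's log output.
    @Override
    protected void onError(Request request, Throwable e) {
        System.err.println("download failed: " + request.getUrl() + " cause: " + e);
    }

    // Post-revert hook (the signature restored by this commit): only the request
    // is available; the cause is logged internally by HttpClientDownloader.
    // @Override
    // protected void onError(Request request) {
    //     System.err.println("download failed: " + request.getUrl());
    // }
}
```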

@@ -87,7 +87,7 @@ public class HttpClientDownloader extends AbstractDownloader {
return page;
} catch (IOException e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, e);
onError(request);
return page;
} finally {
if (httpResponse != null) {
@@ -110,7 +110,7 @@ public class HttpClientDownloader extends AbstractDownloader {
String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()) {
if (!request.isBinaryContent()){
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
}
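Aside: the charset fallback above passes both the Content-Type value and the raw bytes to getHtmlCharset. A rough sketch of the Content-Type half of that job, as a hypothetical standalone helper (judging by its (contentType, bytes) signature, webmagic's own method presumably also inspects the bytes, which this sketch omits):

```java
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

// Hypothetical helper, not webmagic's actual implementation.
final class ContentTypeCharset {
    static Charset fromContentType(String contentType) {
        if (contentType != null) {
            // e.g. "text/html; charset=UTF-8"
            for (String part : contentType.split(";")) {
                part = part.trim();
                if (part.regionMatches(true, 0, "charset=", 0, 8)) {
                    return Charset.forName(part.substring(8).trim());
                }
            }
        }
        return StandardCharsets.UTF_8; // assumed default when nothing is declared
    }
}
```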

@@ -16,70 +16,73 @@ import java.io.*;
* @version 0.5.3
*/
public class PhantomJSDownloader extends AbstractDownloader {
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
private static String crawlJsPath;
private static String phantomJsCommand = "phantomjs"; // default
private int retryNum;
private int threadNum;
public PhantomJSDownloader() {
this.initPhantomjsCrawlPath();
}
/**
* New constructor that supports a custom phantomjs command.
* <p>
* example:
* phantomjs.exe -- for Windows environments
* phantomjs --ignore-ssl-errors=yes -- ignore certificate errors when the crawled URL is https
* /usr/local/bin/phantomjs -- absolute path of the command, avoiding an IOException caused by PATH problems
*
*
* example:
* phantomjs.exe -- for Windows environments
* phantomjs --ignore-ssl-errors=yes -- ignore certificate errors when the crawled URL is https
* /usr/local/bin/phantomjs -- absolute path of the command, avoiding an IOException caused by PATH problems
*
* @param phantomJsCommand phantomJsCommand
*/
public PhantomJSDownloader(String phantomJsCommand) {
this.initPhantomjsCrawlPath();
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
}
/**
* Constructor that supports a custom crawl.js path: when another project depends on this jar, runtime.exec() cannot use the crawl.js packaged inside the jar when it launches phantomjs, so the script path has to be supplied explicitly.
* <pre>
* crawl.js start --
*
*
* var system = require('system');
* var url = system.args[1];
*
*
* var page = require('webpage').create();
* page.settings.loadImages = false;
* page.settings.resourceTimeout = 5000;
*
*
* page.open(url, function (status) {
* if (status != 'success') {
* console.log("HTTP request failed!");
* } else {
* console.log(page.content);
* }
*
*
* page.close();
* phantom.exit();
* });
*
*
* -- crawl.js end
* </pre>
* The js above can be copied into a standalone crawl.js file and used as-is.
* <p>
*
* example:
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
*
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
*
* @param phantomJsCommand phantomJsCommand
* @param crawlJsPath crawlJsPath
* @param crawlJsPath crawlJsPath
*/
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
PhantomJSDownloader.crawlJsPath = crawlJsPath;
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
PhantomJSDownloader.crawlJsPath = crawlJsPath;
}
private void initPhantomjsCrawlPath() {
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
+ System.getProperty("file.separator") + "crawl.js ";
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
}
@Override
@@ -87,41 +90,61 @@ public class PhantomJSDownloader extends AbstractDownloader {
if (logger.isInfoEnabled()) {
logger.info("downloading page: " + request.getUrl());
}
Page page = Page.fail();
try {
String content = getPage(request);
if (!content.contains("HTTP request failed")) {
page.setDownloadSuccess(true);
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
String content = getPage(request);
if (content.contains("HTTP request failed")) {
for (int i = 1; i <= getRetryNum(); i++) {
content = getPage(request);
if (!content.contains("HTTP request failed")) {
break;
}
}
if (content.contains("HTTP request failed")) {
//when failed
Page page = new Page();
page.setRequest(request);
page.setStatusCode(200);
return page;
}
onSuccess(request);
} catch (Exception e) {
onError(request, e);
logger.warn("download page {} error", request.getUrl(), e);
}
Page page = new Page();
page.setRawText(content);
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(200);
return page;
}
@Override
public void setThread(int threadNum) {
// ignore
this.threadNum = threadNum;
}
protected String getPage(Request request) throws Exception {
String url = request.getUrl();
Runtime runtime = Runtime.getRuntime();
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
InputStream is = process.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
StringBuilder builder = new StringBuilder();
String line;
while ((line = br.readLine()) != null) {
builder.append(line).append("\n");
protected String getPage(Request request) {
try {
String url = request.getUrl();
Runtime runtime = Runtime.getRuntime();
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
InputStream is = process.getInputStream();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
StringBuffer stringBuffer = new StringBuffer();
String line;
while ((line = br.readLine()) != null) {
stringBuffer.append(line).append("\n");
}
return stringBuffer.toString();
} catch (IOException e) {
e.printStackTrace();
}
return builder.toString();
return null;
}
public int getRetryNum() {
return retryNum;
}
public PhantomJSDownloader setRetryNum(int retryNum) {
this.retryNum = retryNum;
return this;
}
}
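For orientation, a hedged usage sketch of this downloader (paths are placeholders, the PageProcessor is a stub, and the webmagic-extension module is assumed to be on the classpath):

```java
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.PhantomJSDownloader;
import us.codecraft.webmagic.processor.PageProcessor;

public class PhantomJsUsageSketch {
    public static void main(String[] args) {
        // Stub processor: just report how much HTML came back.
        PageProcessor processor = new PageProcessor() {
            @Override
            public void process(Page page) {
                String raw = page.getRawText();
                System.out.println(page.getUrl().get() + " -> "
                        + (raw == null ? 0 : raw.length()) + " chars");
            }

            @Override
            public Site getSite() {
                return Site.me().setSleepTime(500);
            }
        };

        Spider.create(processor)
                // absolute paths avoid the PATH-related IOException mentioned in the javadoc above
                .setDownloader(new PhantomJSDownloader("/usr/local/bin/phantomjs", "/your/path/crawl.js")
                        .setRetryNum(3)) // handled by the retry loop shown in the hunk above
                .addUrl("http://example.com/")
                .run();
    }
}
```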

@@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
@@ -24,120 +24,112 @@ import java.util.Map;
* Selenium driver<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 1:37 <br>
* Date: 13-7-26 <br>
* Time: 1:37 <br>
*/
public class SeleniumDownloader extends AbstractDownloader implements Closeable {
private volatile WebDriverPool webDriverPool;
private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0;
private int poolSize = 1;
private static final String DRIVER_PHANTOMJS = "phantomjs";
/**
*
*
* @param chromeDriverPath chromeDriverPath
*/
public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver",
chromeDriverPath);
}
/**
* Constructor without any filed. Construct PhantomJS browser
*
* @author bob.li.0718@gmail.com
*/
public SeleniumDownloader() {
// System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
}
/**
* set sleep time to wait until load success
*
* @param sleepTime sleepTime
* @return this
*/
public SeleniumDownloader setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
@Override
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver = null;
Page page = Page.fail();
try {
webDriver = webDriverPool.get();
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
if (sleepTime > 0) {
Thread.sleep(sleepTime);
}
} catch (InterruptedException e) {
e.printStackTrace();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies()
.entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(),
cookieEntry.getValue());
manage.addCookie(cookie);
}
}
/*
* TODO You can add mouse event or other processes
*
* @author: bob.li.0718@gmail.com
*/
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
page.setDownloadSuccess(true);
page.setRawText(content);
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
onSuccess(request);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, e);
} finally {
if (webDriver != null) {
webDriverPool.returnToPool(webDriver);
}
}
return page;
}
private void checkInit() {
if (webDriverPool == null) {
synchronized (this) {
webDriverPool = new WebDriverPool(poolSize);
}
}
}
@Override
public void setThread(int thread) {
this.poolSize = thread;
}
@Override
public void close() throws IOException {
webDriverPool.closeAll();
}
public class SeleniumDownloader implements Downloader, Closeable {
private volatile WebDriverPool webDriverPool;
private Logger logger = LoggerFactory.getLogger(getClass());
private int sleepTime = 0;
private int poolSize = 1;
private static final String DRIVER_PHANTOMJS = "phantomjs";
/**
*
*
* @param chromeDriverPath chromeDriverPath
*/
public SeleniumDownloader(String chromeDriverPath) {
System.getProperties().setProperty("webdriver.chrome.driver",
chromeDriverPath);
}
/**
* Constructor without any filed. Construct PhantomJS browser
*
* @author bob.li.0718@gmail.com
*/
public SeleniumDownloader() {
// System.setProperty("phantomjs.binary.path",
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
}
/**
* set sleep time to wait until load success
*
* @param sleepTime sleepTime
* @return this
*/
public SeleniumDownloader setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
return this;
}
@Override
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver;
try {
webDriver = webDriverPool.get();
} catch (InterruptedException e) {
logger.warn("interrupted", e);
return null;
}
logger.info("downloading page " + request.getUrl());
webDriver.get(request.getUrl());
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
e.printStackTrace();
}
WebDriver.Options manage = webDriver.manage();
Site site = task.getSite();
if (site.getCookies() != null) {
for (Map.Entry<String, String> cookieEntry : site.getCookies()
.entrySet()) {
Cookie cookie = new Cookie(cookieEntry.getKey(),
cookieEntry.getValue());
manage.addCookie(cookie);
}
}
/*
* TODO You can add mouse event or other processes
*
* @author: bob.li.0718@gmail.com
*/
WebElement webElement = webDriver.findElement(By.xpath("/html"));
String content = webElement.getAttribute("outerHTML");
Page page = new Page();
page.setRawText(content);
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
webDriverPool.returnToPool(webDriver);
return page;
}
private void checkInit() {
if (webDriverPool == null) {
synchronized (this) {
webDriverPool = new WebDriverPool(poolSize);
}
}
}
@Override
public void setThread(int thread) {
this.poolSize = thread;
}
@Override
public void close() throws IOException {
webDriverPool.closeAll();
}
}
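Likewise, a hedged usage sketch for SeleniumDownloader (the driver path is a placeholder; the package and webmagic-selenium module are assumed, and the PageProcessor would follow the same stub pattern as in the PhantomJS sketch above):

```java
import java.io.IOException;

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader;
import us.codecraft.webmagic.processor.PageProcessor;

public class SeleniumUsageSketch {
    static void crawl(PageProcessor processor) throws IOException {
        // Placeholder driver path; the constructor only sets webdriver.chrome.driver.
        SeleniumDownloader downloader = new SeleniumDownloader("/your/path/chromedriver")
                .setSleepTime(1000); // wait 1s after get() so JS-rendered content is present
        try {
            Spider.create(processor)
                    // Spider is expected to call setThread(n) on the downloader,
                    // which sizes the WebDriver pool used above.
                    .setDownloader(downloader)
                    .addUrl("https://example.com/")
                    .thread(2)
                    .run();
        } finally {
            downloader.close(); // releases all pooled WebDriver instances
        }
    }
}
```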
