|
|
@ -16,73 +16,70 @@ import java.io.*;
|
|
|
|
* @version 0.5.3
|
|
|
|
* @version 0.5.3
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
public class PhantomJSDownloader extends AbstractDownloader {
|
|
|
|
public class PhantomJSDownloader extends AbstractDownloader {
|
|
|
|
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
|
|
|
private static Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);
|
|
|
|
|
|
|
|
private static String crawlJsPath;
|
|
|
|
private static String crawlJsPath;
|
|
|
|
private static String phantomJsCommand = "phantomjs"; // default
|
|
|
|
private static String phantomJsCommand = "phantomjs"; // default
|
|
|
|
|
|
|
|
|
|
|
|
private int retryNum;
|
|
|
|
|
|
|
|
private int threadNum;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public PhantomJSDownloader() {
|
|
|
|
public PhantomJSDownloader() {
|
|
|
|
this.initPhantomjsCrawlPath();
|
|
|
|
this.initPhantomjsCrawlPath();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* 添加新的构造函数,支持phantomjs自定义命令
|
|
|
|
* 添加新的构造函数,支持phantomjs自定义命令
|
|
|
|
*
|
|
|
|
* <p>
|
|
|
|
* example:
|
|
|
|
* example:
|
|
|
|
* phantomjs.exe 支持windows环境
|
|
|
|
* phantomjs.exe 支持windows环境
|
|
|
|
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
|
|
|
* phantomjs --ignore-ssl-errors=yes 忽略抓取地址是https时的一些错误
|
|
|
|
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
|
|
|
* /usr/local/bin/phantomjs 命令的绝对路径,避免因系统环境变量引起的IOException
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* @param phantomJsCommand phantomJsCommand
|
|
|
|
* @param phantomJsCommand phantomJsCommand
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
public PhantomJSDownloader(String phantomJsCommand) {
|
|
|
|
public PhantomJSDownloader(String phantomJsCommand) {
|
|
|
|
this.initPhantomjsCrawlPath();
|
|
|
|
this.initPhantomjsCrawlPath();
|
|
|
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
|
|
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
|
|
|
|
* 新增构造函数,支持crawl.js路径自定义,因为当其他项目依赖此jar包时,runtime.exec()执行phantomjs命令时无使用法jar包中的crawl.js
|
|
|
|
* <pre>
|
|
|
|
* <pre>
|
|
|
|
* crawl.js start --
|
|
|
|
* crawl.js start --
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* var system = require('system');
|
|
|
|
* var system = require('system');
|
|
|
|
* var url = system.args[1];
|
|
|
|
* var url = system.args[1];
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* var page = require('webpage').create();
|
|
|
|
* var page = require('webpage').create();
|
|
|
|
* page.settings.loadImages = false;
|
|
|
|
* page.settings.loadImages = false;
|
|
|
|
* page.settings.resourceTimeout = 5000;
|
|
|
|
* page.settings.resourceTimeout = 5000;
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* page.open(url, function (status) {
|
|
|
|
* page.open(url, function (status) {
|
|
|
|
* if (status != 'success') {
|
|
|
|
* if (status != 'success') {
|
|
|
|
* console.log("HTTP request failed!");
|
|
|
|
* console.log("HTTP request failed!");
|
|
|
|
* } else {
|
|
|
|
* } else {
|
|
|
|
* console.log(page.content);
|
|
|
|
* console.log(page.content);
|
|
|
|
* }
|
|
|
|
* }
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* page.close();
|
|
|
|
* page.close();
|
|
|
|
* phantom.exit();
|
|
|
|
* phantom.exit();
|
|
|
|
* });
|
|
|
|
* });
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* -- crawl.js end
|
|
|
|
* -- crawl.js end
|
|
|
|
* </pre>
|
|
|
|
* </pre>
|
|
|
|
* 具体项目时可以将以上js代码复制下来使用
|
|
|
|
* 具体项目时可以将以上js代码复制下来使用
|
|
|
|
*
|
|
|
|
* <p>
|
|
|
|
* example:
|
|
|
|
* example:
|
|
|
|
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
|
|
|
* new PhantomJSDownloader("/your/path/phantomjs", "/your/path/crawl.js");
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* @param phantomJsCommand phantomJsCommand
|
|
|
|
* @param phantomJsCommand phantomJsCommand
|
|
|
|
* @param crawlJsPath crawlJsPath
|
|
|
|
* @param crawlJsPath crawlJsPath
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
|
|
|
|
public PhantomJSDownloader(String phantomJsCommand, String crawlJsPath) {
|
|
|
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
|
|
|
PhantomJSDownloader.phantomJsCommand = phantomJsCommand;
|
|
|
|
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
|
|
|
PhantomJSDownloader.crawlJsPath = crawlJsPath;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private void initPhantomjsCrawlPath() {
|
|
|
|
private void initPhantomjsCrawlPath() {
|
|
|
|
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath() + System.getProperty("file.separator") + "crawl.js ";
|
|
|
|
PhantomJSDownloader.crawlJsPath = new File(this.getClass().getResource("/").getPath()).getPath()
|
|
|
|
|
|
|
|
+ System.getProperty("file.separator") + "crawl.js ";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
@ -90,61 +87,41 @@ public class PhantomJSDownloader extends AbstractDownloader {
|
|
|
|
if (logger.isInfoEnabled()) {
|
|
|
|
if (logger.isInfoEnabled()) {
|
|
|
|
logger.info("downloading page: " + request.getUrl());
|
|
|
|
logger.info("downloading page: " + request.getUrl());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String content = getPage(request);
|
|
|
|
|
|
|
|
if (content.contains("HTTP request failed")) {
|
|
|
|
Page page = Page.fail();
|
|
|
|
for (int i = 1; i <= getRetryNum(); i++) {
|
|
|
|
try {
|
|
|
|
content = getPage(request);
|
|
|
|
String content = getPage(request);
|
|
|
|
if (!content.contains("HTTP request failed")) {
|
|
|
|
if (!content.contains("HTTP request failed")) {
|
|
|
|
break;
|
|
|
|
page.setDownloadSuccess(true);
|
|
|
|
}
|
|
|
|
page.setRawText(content);
|
|
|
|
}
|
|
|
|
page.setUrl(new PlainText(request.getUrl()));
|
|
|
|
if (content.contains("HTTP request failed")) {
|
|
|
|
|
|
|
|
//when failed
|
|
|
|
|
|
|
|
Page page = new Page();
|
|
|
|
|
|
|
|
page.setRequest(request);
|
|
|
|
page.setRequest(request);
|
|
|
|
return page;
|
|
|
|
page.setStatusCode(200);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
onSuccess(request);
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
|
|
|
onError(request, e);
|
|
|
|
|
|
|
|
logger.warn("download page {} error", request.getUrl(), e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Page page = new Page();
|
|
|
|
|
|
|
|
page.setRawText(content);
|
|
|
|
|
|
|
|
page.setUrl(new PlainText(request.getUrl()));
|
|
|
|
|
|
|
|
page.setRequest(request);
|
|
|
|
|
|
|
|
page.setStatusCode(200);
|
|
|
|
|
|
|
|
return page;
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
@Override
|
|
|
|
public void setThread(int threadNum) {
|
|
|
|
public void setThread(int threadNum) {
|
|
|
|
this.threadNum = threadNum;
|
|
|
|
// ignore
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
protected String getPage(Request request) {
|
|
|
|
protected String getPage(Request request) throws Exception {
|
|
|
|
try {
|
|
|
|
String url = request.getUrl();
|
|
|
|
String url = request.getUrl();
|
|
|
|
Runtime runtime = Runtime.getRuntime();
|
|
|
|
Runtime runtime = Runtime.getRuntime();
|
|
|
|
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
|
|
|
Process process = runtime.exec(phantomJsCommand + " " + crawlJsPath + " " + url);
|
|
|
|
InputStream is = process.getInputStream();
|
|
|
|
InputStream is = process.getInputStream();
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
|
|
|
StringBuilder builder = new StringBuilder();
|
|
|
|
StringBuffer stringBuffer = new StringBuffer();
|
|
|
|
String line;
|
|
|
|
String line;
|
|
|
|
while ((line = br.readLine()) != null) {
|
|
|
|
while ((line = br.readLine()) != null) {
|
|
|
|
builder.append(line).append("\n");
|
|
|
|
stringBuffer.append(line).append("\n");
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return stringBuffer.toString();
|
|
|
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return builder.toString();
|
|
|
|
return null;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public int getRetryNum() {
|
|
|
|
|
|
|
|
return retryNum;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public PhantomJSDownloader setRetryNum(int retryNum) {
|
|
|
|
|
|
|
|
this.retryNum = retryNum;
|
|
|
|
|
|
|
|
return this;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|