|
|
|
@ -11,7 +11,7 @@ import us.codecraft.webmagic.Page;
|
|
|
|
|
import us.codecraft.webmagic.Request;
|
|
|
|
|
import us.codecraft.webmagic.Site;
|
|
|
|
|
import us.codecraft.webmagic.Task;
|
|
|
|
|
import us.codecraft.webmagic.downloader.Downloader;
|
|
|
|
|
import us.codecraft.webmagic.downloader.AbstractDownloader;
|
|
|
|
|
import us.codecraft.webmagic.selector.Html;
|
|
|
|
|
import us.codecraft.webmagic.selector.PlainText;
|
|
|
|
|
|
|
|
|
@ -24,112 +24,120 @@ import java.util.Map;
|
|
|
|
|
* 需要下载Selenium driver支持。<br>
|
|
|
|
|
*
|
|
|
|
|
* @author code4crafter@gmail.com <br>
|
|
|
|
|
* Date: 13-7-26 <br>
|
|
|
|
|
* Time: 下午1:37 <br>
|
|
|
|
|
* Date: 13-7-26 <br>
|
|
|
|
|
* Time: 下午1:37 <br>
|
|
|
|
|
*/
|
|
|
|
|
public class SeleniumDownloader implements Downloader, Closeable {
|
|
|
|
|
|
|
|
|
|
private volatile WebDriverPool webDriverPool;
|
|
|
|
|
|
|
|
|
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
|
|
|
|
|
|
|
|
private int sleepTime = 0;
|
|
|
|
|
|
|
|
|
|
private int poolSize = 1;
|
|
|
|
|
|
|
|
|
|
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 新建
|
|
|
|
|
*
|
|
|
|
|
* @param chromeDriverPath chromeDriverPath
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader(String chromeDriverPath) {
|
|
|
|
|
System.getProperties().setProperty("webdriver.chrome.driver",
|
|
|
|
|
chromeDriverPath);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Constructor without any filed. Construct PhantomJS browser
|
|
|
|
|
*
|
|
|
|
|
* @author bob.li.0718@gmail.com
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader() {
|
|
|
|
|
// System.setProperty("phantomjs.binary.path",
|
|
|
|
|
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* set sleep time to wait until load success
|
|
|
|
|
*
|
|
|
|
|
* @param sleepTime sleepTime
|
|
|
|
|
* @return this
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader setSleepTime(int sleepTime) {
|
|
|
|
|
this.sleepTime = sleepTime;
|
|
|
|
|
return this;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public Page download(Request request, Task task) {
|
|
|
|
|
checkInit();
|
|
|
|
|
WebDriver webDriver;
|
|
|
|
|
try {
|
|
|
|
|
webDriver = webDriverPool.get();
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
logger.warn("interrupted", e);
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
logger.info("downloading page " + request.getUrl());
|
|
|
|
|
webDriver.get(request.getUrl());
|
|
|
|
|
try {
|
|
|
|
|
Thread.sleep(sleepTime);
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
}
|
|
|
|
|
WebDriver.Options manage = webDriver.manage();
|
|
|
|
|
Site site = task.getSite();
|
|
|
|
|
if (site.getCookies() != null) {
|
|
|
|
|
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
|
|
|
|
.entrySet()) {
|
|
|
|
|
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
|
|
|
|
cookieEntry.getValue());
|
|
|
|
|
manage.addCookie(cookie);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* TODO You can add mouse event or other processes
|
|
|
|
|
*
|
|
|
|
|
* @author: bob.li.0718@gmail.com
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
|
|
|
|
String content = webElement.getAttribute("outerHTML");
|
|
|
|
|
Page page = new Page();
|
|
|
|
|
page.setRawText(content);
|
|
|
|
|
page.setHtml(new Html(content, request.getUrl()));
|
|
|
|
|
page.setUrl(new PlainText(request.getUrl()));
|
|
|
|
|
page.setRequest(request);
|
|
|
|
|
webDriverPool.returnToPool(webDriver);
|
|
|
|
|
return page;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void checkInit() {
|
|
|
|
|
if (webDriverPool == null) {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
webDriverPool = new WebDriverPool(poolSize);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setThread(int thread) {
|
|
|
|
|
this.poolSize = thread;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void close() throws IOException {
|
|
|
|
|
webDriverPool.closeAll();
|
|
|
|
|
}
|
|
|
|
|
public class SeleniumDownloader extends AbstractDownloader implements Closeable {
|
|
|
|
|
|
|
|
|
|
private volatile WebDriverPool webDriverPool;
|
|
|
|
|
|
|
|
|
|
private Logger logger = LoggerFactory.getLogger(getClass());
|
|
|
|
|
|
|
|
|
|
private int sleepTime = 0;
|
|
|
|
|
|
|
|
|
|
private int poolSize = 1;
|
|
|
|
|
|
|
|
|
|
private static final String DRIVER_PHANTOMJS = "phantomjs";
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 新建
|
|
|
|
|
*
|
|
|
|
|
* @param chromeDriverPath chromeDriverPath
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader(String chromeDriverPath) {
|
|
|
|
|
System.getProperties().setProperty("webdriver.chrome.driver",
|
|
|
|
|
chromeDriverPath);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Constructor without any filed. Construct PhantomJS browser
|
|
|
|
|
*
|
|
|
|
|
* @author bob.li.0718@gmail.com
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader() {
|
|
|
|
|
// System.setProperty("phantomjs.binary.path",
|
|
|
|
|
// "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* set sleep time to wait until load success
|
|
|
|
|
*
|
|
|
|
|
* @param sleepTime sleepTime
|
|
|
|
|
* @return this
|
|
|
|
|
*/
|
|
|
|
|
public SeleniumDownloader setSleepTime(int sleepTime) {
|
|
|
|
|
this.sleepTime = sleepTime;
|
|
|
|
|
return this;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public Page download(Request request, Task task) {
|
|
|
|
|
checkInit();
|
|
|
|
|
WebDriver webDriver = null;
|
|
|
|
|
Page page = Page.fail();
|
|
|
|
|
try {
|
|
|
|
|
webDriver = webDriverPool.get();
|
|
|
|
|
|
|
|
|
|
logger.info("downloading page " + request.getUrl());
|
|
|
|
|
webDriver.get(request.getUrl());
|
|
|
|
|
try {
|
|
|
|
|
if (sleepTime > 0) {
|
|
|
|
|
Thread.sleep(sleepTime);
|
|
|
|
|
}
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
}
|
|
|
|
|
WebDriver.Options manage = webDriver.manage();
|
|
|
|
|
Site site = task.getSite();
|
|
|
|
|
if (site.getCookies() != null) {
|
|
|
|
|
for (Map.Entry<String, String> cookieEntry : site.getCookies()
|
|
|
|
|
.entrySet()) {
|
|
|
|
|
Cookie cookie = new Cookie(cookieEntry.getKey(),
|
|
|
|
|
cookieEntry.getValue());
|
|
|
|
|
manage.addCookie(cookie);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* TODO You can add mouse event or other processes
|
|
|
|
|
*
|
|
|
|
|
* @author: bob.li.0718@gmail.com
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
WebElement webElement = webDriver.findElement(By.xpath("/html"));
|
|
|
|
|
String content = webElement.getAttribute("outerHTML");
|
|
|
|
|
page.setDownloadSuccess(true);
|
|
|
|
|
page.setRawText(content);
|
|
|
|
|
page.setHtml(new Html(content, request.getUrl()));
|
|
|
|
|
page.setUrl(new PlainText(request.getUrl()));
|
|
|
|
|
page.setRequest(request);
|
|
|
|
|
onSuccess(request);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
logger.warn("download page {} error", request.getUrl(), e);
|
|
|
|
|
onError(request, e);
|
|
|
|
|
} finally {
|
|
|
|
|
if (webDriver != null) {
|
|
|
|
|
webDriverPool.returnToPool(webDriver);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return page;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void checkInit() {
|
|
|
|
|
if (webDriverPool == null) {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
webDriverPool = new WebDriverPool(poolSize);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void setThread(int thread) {
|
|
|
|
|
this.poolSize = thread;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void close() throws IOException {
|
|
|
|
|
webDriverPool.closeAll();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|