diff --git a/webmagic-selenium/config.ini b/webmagic-selenium/config.ini new file mode 100644 index 00000000..78abbc5a --- /dev/null +++ b/webmagic-selenium/config.ini @@ -0,0 +1,12 @@ +# What WebDriver to use for the tests +driver=phantomjs +#driver=firefox +#driver=chrome +#driver=http://localhost:8910 +#driver=http://localhost:4444/wd/hub + +# PhantomJS specific config (change according to your installation) +#phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5 +phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs +#phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js +phantomjs_driver_loglevel=DEBUG \ No newline at end of file diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 29fe64cd..8fbd8664 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -1,40 +1,49 @@ - - - webmagic-parent - us.codecraft - 0.5.2 - - 4.0.0 + + + webmagic-parent + us.codecraft + 0.5.2 + + 4.0.0 - webmagic-selenium + webmagic-selenium - - - org.seleniumhq.selenium - selenium-java - 2.33.0 - - - us.codecraft - webmagic-core - ${project.version} - - - junit - junit - - + + + org.seleniumhq.selenium + selenium-java + 2.46.0 + + + us.codecraft + webmagic-core + ${project.version} + + + com.github.detro + phantomjsdriver + 1.2.0 + - - - - maven-deploy-plugin - - true - - - - + + + + junit + junit + + + + + + + maven-deploy-plugin + + true + + + + \ No newline at end of file diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 50d332ba..ad1fb494 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -5,6 +5,7 @@ import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -23,90 +24,113 @@ import java.util.Map; * 需要下载Selenium driver支持。
* * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:37
+ * Date: 13-7-26
+ * Time: 下午1:37
*/ public class SeleniumDownloader implements Downloader, Closeable { - private volatile WebDriverPool webDriverPool; - - private Logger logger = Logger.getLogger(getClass()); - - private int sleepTime = 0; - - private int poolSize = 1; - - /** - * 新建 - * - * @param chromeDriverPath - */ - public SeleniumDownloader(String chromeDriverPath) { - System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - } - - /** - * set sleep time to wait until load success - * - * @param sleepTime - * @return this - */ - public SeleniumDownloader setSleepTime(int sleepTime) { - this.sleepTime = sleepTime; - return this; - } - - @Override - public Page download(Request request, Task task) { - checkInit(); - WebDriver webDriver; - try { - webDriver = webDriverPool.get(); - } catch (InterruptedException e) { - logger.warn("interrupted", e); - return null; - } - logger.info("downloading page " + request.getUrl()); - webDriver.get(request.getUrl()); - try { - Thread.sleep(sleepTime); - } catch (InterruptedException e) { - e.printStackTrace(); - } - WebDriver.Options manage = webDriver.manage(); - Site site = task.getSite(); - if (site.getCookies() != null) { - for (Map.Entry cookieEntry : site.getCookies().entrySet()) { - Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); - manage.addCookie(cookie); - } - } - WebElement webElement = webDriver.findElement(By.xpath("/html")); - String content = webElement.getAttribute("outerHTML"); - Page page = new Page(); - page.setRawText(content); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - webDriverPool.returnToPool(webDriver); - return page; - } - - private void checkInit() { - if (webDriverPool == null) { - synchronized (this){ - webDriverPool = new WebDriverPool(poolSize); - } - } - } - - @Override - public void setThread(int thread) { - this.poolSize = thread; - } - - @Override - public void close() throws IOException { - webDriverPool.closeAll(); - } + private volatile WebDriverPool webDriverPool; + + private Logger logger = Logger.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath + */ + public SeleniumDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + } + + /** + * Constructor without any filed. Construct PhantomJS browser + * + * @author bob.li.0718@gmail.com + */ + public SeleniumDownloader() { + // System.setProperty("phantomjs.binary.path", + // "/Users/Bingo/Downloads/phantomjs-1.9.7-macosx/bin/phantomjs"); + } + + /** + * set sleep time to wait until load success + * + * @param sleepTime + * @return this + */ + public SeleniumDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + checkInit(); + WebDriver webDriver; + try { + webDriver = webDriverPool.get(); + } catch (InterruptedException e) { + logger.warn("interrupted", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + webDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = webDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = webDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setRawText(content); + page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, + request.getUrl()))); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + webDriverPool.returnToPool(webDriver); + return page; + } + + private void checkInit() { + if (webDriverPool == null) { + synchronized (this) { + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + webDriverPool.closeAll(); + } } diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java index f628ede9..4a93332b 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java @@ -3,89 +3,231 @@ package us.codecraft.webmagic.downloader.selenium; import org.apache.log4j.Logger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; - +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriver; +import org.openqa.selenium.phantomjs.PhantomJSDriverService; +import org.openqa.selenium.remote.DesiredCapabilities; +import org.openqa.selenium.remote.RemoteWebDriver; + +import java.io.FileReader; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Properties; import java.util.concurrent.BlockingDeque; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; /** * @author code4crafter@gmail.com
- * Date: 13-7-26
- * Time: 下午1:41
+ * Date: 13-7-26
+ * Time: 下午1:41
*/ class WebDriverPool { - private Logger logger = Logger.getLogger(getClass()); - - private final static int DEFAULT_CAPACITY = 5; - - private final int capacity; - - private final static int STAT_RUNNING = 1; - - private final static int STAT_CLODED = 2; - - private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); - - /** - * store webDrivers created - */ - private List webDriverList = Collections.synchronizedList(new ArrayList()); - - /** - * store webDrivers available - */ - private BlockingDeque innerQueue = new LinkedBlockingDeque(); - - public WebDriverPool(int capacity) { - this.capacity = capacity; - } - - public WebDriverPool() { - this(DEFAULT_CAPACITY); - } - - public WebDriver get() throws InterruptedException { - checkRunning(); - WebDriver poll = innerQueue.poll(); - if (poll != null) { - return poll; - } - if (webDriverList.size() < capacity) { - synchronized (webDriverList) { - if (webDriverList.size() < capacity) { - ChromeDriver e = new ChromeDriver(); - innerQueue.add(e); - webDriverList.add(e); - } - } - - } - return innerQueue.take(); - } - - public void returnToPool(WebDriver webDriver) { - checkRunning(); - innerQueue.add(webDriver); - } - - protected void checkRunning() { - if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - throw new IllegalStateException("Already closed!"); - } - } - - public void closeAll() { - boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); - if (!b) { - throw new IllegalStateException("Already closed!"); - } - for (WebDriver webDriver : webDriverList) { - logger.info("Quit webDriver" + webDriver); - webDriver.quit(); - } - } + private Logger logger = Logger.getLogger(getClass()); + + private final static int DEFAULT_CAPACITY = 5; + + private final int capacity; + + private final static int STAT_RUNNING = 1; + + private final static int STAT_CLODED = 2; + + private AtomicInteger stat = new AtomicInteger(STAT_RUNNING); + + /* + * new fields for configuring phantomJS + */ + private WebDriver mDriver = null; + private boolean mAutoQuitDriver = true; + + private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini"; + private static final String DRIVER_FIREFOX = "firefox"; + private static final String DRIVER_CHROME = "chrome"; + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + protected static Properties sConfig; + protected static DesiredCapabilities sCaps; + + /** + * Configure the GhostDriver, and initialize a WebDriver instance. This part + * of code comes from GhostDriver. + * https://github.com/detro/ghostdriver/tree/master/test/java/src/test/java/ghostdriver + * + * @author bob.li.0718@gmail.com + * @throws IOException + */ + public void configure() throws IOException { + // Read config file + sConfig = new Properties(); + sConfig.load(new FileReader(CONFIG_FILE)); + + // Prepare capabilities + sCaps = new DesiredCapabilities(); + sCaps.setJavascriptEnabled(true); + sCaps.setCapability("takesScreenshot", false); + + String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); + + // Fetch PhantomJS-specific configuration parameters + if (driver.equals(DRIVER_PHANTOMJS)) { + // "phantomjs_exec_path" + if (sConfig.getProperty("phantomjs_exec_path") != null) { + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, + sConfig.getProperty("phantomjs_exec_path")); + } else { + throw new IOException( + String.format( + "Property '%s' not set!", + PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY)); + } + // "phantomjs_driver_path" + if (sConfig.getProperty("phantomjs_driver_path") != null) { + System.out.println("Test will use an external GhostDriver"); + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_PATH_PROPERTY, + sConfig.getProperty("phantomjs_driver_path")); + } else { + System.out + .println("Test will use PhantomJS internal GhostDriver"); + } + } + + // Disable "web-security", enable all possible "ssl-protocols" and + // "ignore-ssl-errors" for PhantomJSDriver + // sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, new + // String[] { + // "--web-security=false", + // "--ssl-protocol=any", + // "--ignore-ssl-errors=true" + // }); + + ArrayList cliArgsCap = new ArrayList(); + cliArgsCap.add("--web-security=false"); + cliArgsCap.add("--ssl-protocol=any"); + cliArgsCap.add("--ignore-ssl-errors=true"); + sCaps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, + cliArgsCap); + + // Control LogLevel for GhostDriver, via CLI arguments + sCaps.setCapability( + PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, + new String[] { "--logLevel=" + + (sConfig.getProperty("phantomjs_driver_loglevel") != null ? sConfig + .getProperty("phantomjs_driver_loglevel") + : "INFO") }); + + // String driver = sConfig.getProperty("driver", DRIVER_PHANTOMJS); + + // Start appropriate Driver + if (isUrl(driver)) { + sCaps.setBrowserName("phantomjs"); + mDriver = new RemoteWebDriver(new URL(driver), sCaps); + } else if (driver.equals(DRIVER_FIREFOX)) { + mDriver = new FirefoxDriver(sCaps); + } else if (driver.equals(DRIVER_CHROME)) { + mDriver = new ChromeDriver(sCaps); + } else if (driver.equals(DRIVER_PHANTOMJS)) { + mDriver = new PhantomJSDriver(sCaps); + } + } + + /** + * check whether input is a valid URL + * + * @author bob.li.0718@gmail.com + * @param urlString + * @return true means yes, otherwise no. + */ + private boolean isUrl(String urlString) { + try { + new URL(urlString); + return true; + } catch (MalformedURLException mue) { + return false; + } + } + + /** + * store webDrivers created + */ + private List webDriverList = Collections + .synchronizedList(new ArrayList()); + + /** + * store webDrivers available + */ + private BlockingDeque innerQueue = new LinkedBlockingDeque(); + + public WebDriverPool(int capacity) { + this.capacity = capacity; + } + + public WebDriverPool() { + this(DEFAULT_CAPACITY); + } + + /** + * + * @return + * @throws InterruptedException + */ + public WebDriver get() throws InterruptedException { + checkRunning(); + WebDriver poll = innerQueue.poll(); + if (poll != null) { + return poll; + } + if (webDriverList.size() < capacity) { + synchronized (webDriverList) { + if (webDriverList.size() < capacity) { + + // add new WebDriver instance into pool + try { + configure(); + innerQueue.add(mDriver); + webDriverList.add(mDriver); + } catch (IOException e) { + e.printStackTrace(); + } + + // ChromeDriver e = new ChromeDriver(); + // WebDriver e = getWebDriver(); + // innerQueue.add(e); + // webDriverList.add(e); + } + } + + } + return innerQueue.take(); + } + + public void returnToPool(WebDriver webDriver) { + checkRunning(); + innerQueue.add(webDriver); + } + + protected void checkRunning() { + if (!stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { + throw new IllegalStateException("Already closed!"); + } + } + + public void closeAll() { + boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLODED); + if (!b) { + throw new IllegalStateException("Already closed!"); + } + for (WebDriver webDriver : webDriverList) { + logger.info("Quit webDriver" + webDriver); + webDriver.quit(); + webDriver = null; + } + } } diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java new file mode 100644 index 00000000..3bab3955 --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java @@ -0,0 +1,46 @@ +package us.codecraft.webmagic.samples; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; +import us.codecraft.webmagic.pipeline.FilePipeline; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * + * Using Selenium with PhantomJS to fetch web-page with JS
+ * + * @author bob.li.0718@gmail.com
+ * Date: 15-7-11
+ */ +public class GooglePlayProcessor implements PageProcessor { + + private Site site; + + @Override + public void process(Page page) { + + page.putField("whole-html", page.getHtml().toString()); + + } + + @Override + public Site getSite() { + if (null == site) { + site = Site.me().setDomain("play.google.com").setSleepTime(300); + } + return site; + } + + public static void main(String[] args) { + Spider.create(new GooglePlayProcessor()) + .thread(5) + .addPipeline( + new FilePipeline( + "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/")) + .setDownloader(new SeleniumDownloader()) + .addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm") + .runAsync(); + } +}