From fcb09f2e08e8ee2e17d4eb3c4cf48200277ab7a7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 26 Jul 2013 14:29:19 +0800 Subject: [PATCH] invite selenium --- .../downloader/SeleniumDownloader.java | 48 +++++++++++ .../selenium/downloader/WebDriverPool.java | 82 +++++++++++++++++++ .../webmagic/selenium/SeleniumTest.java | 29 +++++++ .../downloader/WebDriverPoolTest.java | 28 +++++++ 4 files changed, 187 insertions(+) create mode 100644 webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java create mode 100644 webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java create mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java create mode 100644 webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java new file mode 100644 index 00000000..8fd1c6a2 --- /dev/null +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.selenium.downloader; + +import org.apache.log4j.Logger; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.UrlUtils; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-26
+ * Time: 1:37 PM
+ */
+// Downloader that loads each requested URL in a pooled Chrome browser through Selenium and
+// returns the rendered DOM, so that content produced by JavaScript is part of the page.
+public class SeleniumDownloader implements Downloader {
+
+    /** Pool the Chrome drivers are borrowed from; one driver is held per in-flight download. */
+    private final WebDriverPool webDriverPool;
+
+    private Logger logger = Logger.getLogger(getClass());
+
+    /**
+     * Creates a downloader backed by its own pool of Chrome drivers.
+     *
+     * @param chromeDriverPath path of the chromedriver executable; it is published through the
+     *                         {@code webdriver.chrome.driver} system property
+     */
+    public SeleniumDownloader(String chromeDriverPath) {
+        System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
+        // The pool has to exist before download() runs; the pool itself only starts a browser
+        // when a driver is first requested.
+        this.webDriverPool = new WebDriverPool();
+    }
+
+    /**
+     * Loads the request URL in a pooled browser and wraps the rendered document in a {@link Page}.
+     *
+     * @param request request whose URL is opened
+     * @param task    task the request belongs to (not used by this implementation)
+     * @return the downloaded page, or {@code null} when the thread is interrupted while waiting
+     *         for a free driver
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        WebDriver webDriver;
+        try {
+            webDriver = webDriverPool.get();
+        } catch (InterruptedException e) {
+            logger.warn("interrupted",e);
+            // Restore the interrupt flag so that the caller can still observe the interruption.
+            Thread.currentThread().interrupt();
+            return null;
+        }
+        try {
+            webDriver.get(request.getUrl());
+            // The outerHTML of the root element is the DOM as rendered by the browser.
+            WebElement webElement = webDriver.findElement(By.xpath("/html"));
+            String content = webElement.getAttribute("outerHTML");
+            Page page = new Page();
+            page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
+            page.setUrl(new PlainText(request.getUrl()));
+            page.setRequest(request);
+            return page;
+        } finally {
+            // Always hand the driver back, even on failure; a driver that is not returned is lost
+            // to the pool, and once all of them are lost every further download blocks forever.
+            webDriverPool.returnToPool(webDriver);
+        }
+    }
+
+    /**
+     * Shuts down every browser started by this downloader. The downloader must not be used afterwards.
+     *
+     * @throws IllegalStateException if the pool has already been closed
+     */
+    public void close() {
+        webDriverPool.closeAll();
+    }
+}
diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java
new file mode 100644
index 00000000..039cef98
--- /dev/null
+++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/WebDriverPool.java
@@ -0,0 +1,82 @@
+package us.codecraft.webmagic.selenium.downloader;
+
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.BlockingDeque;
+import java.util.concurrent.LinkedBlockingDeque;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-7-26
+ * Time: 1:41 PM
+ */
+// Bounded pool of Chrome drivers: drivers are created on demand up to the capacity, handed out
+// through get(), given back through returnToPool() and shut down together by closeAll().
+class WebDriverPool {
+
+    private final static int DEFAULT_CAPACITY = 5;
+
+    private final static int STAT_RUNNING = 1;
+
+    private final static int STAT_CLOSED = 2;
+
+    /** Maximum number of drivers this pool creates. */
+    private final int capacity;
+
+    /** Lifecycle flag: STAT_RUNNING until closeAll() switches it to STAT_CLOSED. */
+    private final AtomicInteger stat = new AtomicInteger(STAT_RUNNING);
+
+    /** Every driver created by this pool, idle or borrowed, so that closeAll() reaches all of them. */
+    private final List<WebDriver> webDriverList = Collections.synchronizedList(new ArrayList<WebDriver>());
+
+    /** Drivers that are currently idle and can be handed out. */
+    private final BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>();
+
+    /**
+     * @param capacity maximum number of drivers the pool may create
+     */
+    public WebDriverPool(int capacity) {
+        this.capacity = capacity;
+    }
+
+    /** Creates a pool with the default capacity. */
+    public WebDriverPool() {
+        this(DEFAULT_CAPACITY);
+    }
+
+    /**
+     * Borrows a driver: an idle one if available, otherwise a new one while the pool is below
+     * capacity, otherwise the call blocks until a driver is returned.
+     *
+     * @return a driver that the caller must give back through {@link #returnToPool(WebDriver)}
+     * @throws InterruptedException  if the thread is interrupted while waiting for a driver
+     * @throws IllegalStateException if the pool has been closed
+     */
+    public WebDriver get() throws InterruptedException {
+        checkRunning();
+        WebDriver idle = innerQueue.poll();
+        if (idle != null) {
+            return idle;
+        }
+        if (webDriverList.size() < capacity) {
+            synchronized (webDriverList) {
+                // Checked again under the lock so that concurrent callers cannot exceed the capacity.
+                if (webDriverList.size() < capacity) {
+                    WebDriver created = new ChromeDriver();
+                    webDriverList.add(created);
+                    // Hand the new driver directly to the caller that created it; routing it
+                    // through the queue would let another waiting thread take it and leave this
+                    // caller blocked.
+                    return created;
+                }
+            }
+        }
+        return innerQueue.take();
+    }
+
+    /**
+     * Gives a borrowed driver back so that it can be handed out again.
+     *
+     * @param webDriver driver previously obtained from {@link #get()}
+     * @throws IllegalStateException if the pool has been closed
+     */
+    public void returnToPool(WebDriver webDriver) {
+        checkRunning();
+        innerQueue.add(webDriver);
+    }
+
+    /**
+     * @throws IllegalStateException if the pool has been closed
+     */
+    protected void checkRunning() {
+        if (stat.get() != STAT_RUNNING) {
+            throw new IllegalStateException("Already closed!");
+        }
+    }
+
+    /**
+     * Closes the pool and shuts down every driver it created, including borrowed ones.
+     *
+     * @throws IllegalStateException if the pool has already been closed
+     */
+    public void closeAll() {
+        boolean b = stat.compareAndSet(STAT_RUNNING, STAT_CLOSED);
+        if (!b) {
+            throw new IllegalStateException("Already closed!");
+        }
+        // A list from Collections.synchronizedList must be locked by the caller while iterating.
+        synchronized (webDriverList) {
+            for (WebDriver webDriver : webDriverList) {
+                // quit() ends the whole browser session; close() would only close the current window.
+                webDriver.quit();
+            }
+        }
+    }
+}
diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java
new file mode 100644
index 00000000..fc0a9ec6
--- /dev/null
+++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/SeleniumTest.java
@@ -0,0 +1,29 @@
+package us.codecraft.webmagic.selenium;
+
+import org.junit.Test;
+import org.openqa.selenium.By;
+import org.openqa.selenium.WebDriver;
+import 
org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; + +import java.util.List; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-26
+ * Time: 12:27 PM
+ */
+// Manual smoke test: it starts a real Chrome through the chromedriver at the hard-coded local
+// path below and loads a live web site, so it only works on a machine prepared for it.
+public class SeleniumTest {
+
+    /** Opens a page in Chrome and prints the rendered HTML of its root element. */
+    @Test
+    public void test(){
+        System.getProperties().setProperty("webdriver.chrome.driver","/Users/yihua/Downloads/chromedriver");
+        WebDriver webDriver = new ChromeDriver();
+        try {
+            webDriver.get("http://huaban.com/");
+            List<WebElement> elements = webDriver.findElements(By.xpath("/html"));
+            for (WebElement element : elements) {
+                System.out.println(element.getAttribute("outerHTML"));
+            }
+        } finally {
+            // Shut the browser down even when navigation or lookup fails; quit() ends the whole
+            // session, whereas close() would only close the current window.
+            webDriver.quit();
+        }
+    }
+}
diff --git a/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java
new file mode 100644
index 00000000..d38216f8
--- /dev/null
+++ b/webmagic-plugin/webmagic-selenium/src/test/java/us/codecraft/webmagic/selenium/downloader/WebDriverPoolTest.java
@@ -0,0 +1,28 @@
+package us.codecraft.webmagic.selenium.downloader;
+
+import org.junit.Test;
+import org.openqa.selenium.WebDriver;
+
+/**
+ * @author yihua.huang@dianping.com
+ * @date: 13-7-26
+ * Time: 2:12 PM
+ */
+// Manual smoke test: it starts real Chrome instances through the chromedriver at the hard-coded
+// local path below, so it only works on a machine prepared for it.
+public class WebDriverPoolTest {
+
+    /** Borrows as many drivers as the pool's capacity allows, then closes the pool. */
+    @Test
+    public void test(){
+        String chromeDriverPath = "/Users/yihua/Downloads/chromedriver";
+        System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath);
+        WebDriverPool webDriverPool =new WebDriverPool(5);
+        try {
+            for (int i=0;i<5;i++){
+                WebDriver webDriver = webDriverPool.get();
+                if (webDriver == null) {
+                    throw new AssertionError("pool returned no driver for request " + i);
+                }
+                System.out.println(i);
+            }
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag and fail the test instead of printing and carrying on.
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("interrupted while waiting for a web driver", e);
+        } finally {
+            // Shut down every browser that was started, also when a get() call failed.
+            webDriverPool.closeAll();
+        }
+    }
+}