From e87aabf8fdb76f431d02dd9bd0dbff01973f7856 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 29 Jul 2013 20:01:44 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=BAdownloader=E5=A2=9E=E5=8A=A0=E4=BA=86?= =?UTF-8?q?=E4=B8=80=E4=B8=AA=E6=96=B0=E6=96=B9=E6=B3=95=EF=BC=8C=E5=8F=AF?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE=E7=BA=BF=E7=A8=8B=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/us/codecraft/webmagic/Spider.java | 17 +++++++------ .../webmagic/downloader/Downloader.java | 13 ++++++++-- .../webmagic/downloader/FileDownloader.java | 5 ++++ .../downloader/HttpClientDownloader.java | 13 ++++------ .../downloader/SeleniumDownloader.java | 25 +++++++++++++------ 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 2a8b78fb..a25fd024 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -58,6 +58,8 @@ public class Spider implements Runnable, Task { private ExecutorService executorService; + private int threadNum = 1; + private AtomicInteger stat = new AtomicInteger(STAT_INIT); private final static int STAT_INIT = 0; @@ -144,6 +146,10 @@ public class Spider implements Runnable, Task { if (downloader == null) { this.downloader = new HttpClientDownloader(); } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + downloader.setThread(threadNum); } @Override @@ -158,9 +164,6 @@ public class Spider implements Runnable, Task { } } Request request = scheduler.poll(this); - if (pipelines.isEmpty()) { - pipelines.add(new ConsolePipeline()); - } //singel thread if (executorService == null) { while (request != null) { @@ -211,9 +214,9 @@ public class Spider implements Runnable, Task { } } - private void destroyEach(Object object){ + private void destroyEach(Object object) { 
if (object instanceof Destroyable) { - ((Destroyable)object).destroy(); + ((Destroyable) object).destroy(); } } @@ -267,12 +270,10 @@ public class Spider implements Runnable, Task { */ public Spider thread(int threadNum) { checkIfNotRunning(); + this.threadNum = threadNum; if (threadNum <= 0) { throw new IllegalArgumentException("threadNum should be more than one!"); } - if (downloader==null || downloader instanceof HttpClientDownloader){ - downloader = new HttpClientDownloader(threadNum); - } if (threadNum == 1) { return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java index c431fc3b..9a7f59a3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java @@ -6,9 +6,10 @@ import us.codecraft.webmagic.Task; /** * Downloader是webmagic下载页面的接口。webmagic默认使用了HttpComponent作为下载器,一般情况,你无需自己实现这个接口。
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午12:14 + * Date: 13-4-21 + * Time: 下午12:14 */ public interface Downloader { @@ -20,4 +21,12 @@ public interface Downloader { * @return page */ public Page download(Request request, Task task); + + /** + * 设置线程数,多线程程序一般需要Downloader支持
+ * Implementations that do not need multi-threading support may leave this method body empty (a no-op).
+ * + * @param thread 线程数量 + */ + public void setThread(int thread); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java index d22bf081..722a2eb7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java @@ -67,6 +67,11 @@ public class FileDownloader implements Downloader { return page; } + @Override + public void setThread(int thread) { + + } + private String getHtml(BufferedReader bufferedReader) throws IOException { String line; StringBuilder htmlBuilder= new StringBuilder(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index d7634198..7956cd1e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -32,14 +32,6 @@ public class HttpClientDownloader implements Downloader { private int poolSize; - public HttpClientDownloader(int poolSize) { - this.poolSize = poolSize; - } - - public HttpClientDownloader() { - this(5); - } - @Override public Page download(Request request, Task task) { Site site = task.getSite(); @@ -90,6 +82,11 @@ public class HttpClientDownloader implements Downloader { return null; } + @Override + public void setThread(int thread) { + poolSize=thread; + } + private void handleGzip(HttpResponse httpResponse) { Header ceheader = httpResponse.getEntity().getContentEncoding(); if (ceheader != null) { diff --git a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java 
b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java index 54e3c9c0..76ac0508 100644 --- a/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java +++ b/webmagic-plugin/webmagic-selenium/src/main/java/us/codecraft/webmagic/selenium/downloader/SeleniumDownloader.java @@ -27,12 +27,14 @@ import java.util.Map; */ public class SeleniumDownloader implements Downloader, Destroyable { - private WebDriverPool webDriverPool; + private volatile WebDriverPool webDriverPool; private Logger logger = Logger.getLogger(getClass()); private int sleepTime = 0; + private int poolSize = 1; + /** * 新建 * @@ -40,16 +42,11 @@ public class SeleniumDownloader implements Downloader, Destroyable { */ public SeleniumDownloader(String chromeDriverPath) { System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - webDriverPool = new WebDriverPool(); - } - - public SeleniumDownloader(String chromeDriverPath, int poolSize) { - System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); - webDriverPool = new WebDriverPool(poolSize); } /** * set sleep time to wait until load success + * * @param sleepTime * @return this */ @@ -60,6 +57,7 @@ public class SeleniumDownloader implements Downloader, Destroyable { @Override public Page download(Request request, Task task) { + checkInit(); WebDriver webDriver; try { webDriver = webDriverPool.get(); @@ -93,6 +91,19 @@ public class SeleniumDownloader implements Downloader, Destroyable { return page; } + private void checkInit() { + if (webDriverPool == null) { + synchronized (this){ + webDriverPool = new WebDriverPool(poolSize); + } + } + } + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + @Override public void destroy() { webDriverPool.closeAll();