From b4fcf4116830c332992665e218551db64cd215b4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 31 Oct 2013 22:41:02 +0800 Subject: [PATCH] add exit when complete option --- .../java/us/codecraft/webmagic/Spider.java | 51 ++++++++++++++++--- .../example/OschinaBlogPageProcesser.java | 2 +- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 65ee7af8..1c4160d7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,6 +18,8 @@ import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.locks.Condition; +import java.util.concurrent.locks.ReentrantLock; /** * Entrance of a crawler.
@@ -74,7 +76,7 @@ public class Spider implements Runnable, Task { protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - protected boolean exitWhenComplete = false; + protected boolean exitWhenComplete = true; protected final static int STAT_INIT = 0; @@ -82,6 +84,10 @@ public class Spider implements Runnable, Task { protected final static int STAT_STOPPED = 2; + private ReentrantLock newUrlLock = new ReentrantLock(); + + private Condition newUrlCondition = newUrlLock.newCondition(); + /** * create a spider with pageProcessor. * @@ -245,11 +251,15 @@ public class Spider implements Runnable, Task { if (threadAlive.get() == 0 && exitWhenComplete) { break; } - // when no request found but some thread is alive, sleep a - // while. + // wait until new url added try { - Thread.sleep(100); - } catch (InterruptedException e) { + newUrlLock.lock(); + try { + newUrlCondition.await(); + } catch (InterruptedException e) { + } + } finally { + newUrlLock.unlock(); } } else { final Request requestFinal = request; @@ -263,6 +273,7 @@ public class Spider implements Runnable, Task { logger.error("download " + requestFinal + " error", e); } finally { threadAlive.decrementAndGet(); + signalNewUrl(); } } }); @@ -351,11 +362,16 @@ public class Spider implements Runnable, Task { protected void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - scheduler.push(request, this); + addRequest(request); } } } + private void addRequest(Request request) { + scheduler.push(request, this); + + } + protected void checkIfRunning() { if (stat.get() == STAT_RUNNING) { throw new IllegalStateException("Spider is already running!"); @@ -368,6 +384,29 @@ public class Spider implements Runnable, Task { thread.start(); } + /** + * Add urls to crawl.
+ * + * @param urls urls to add as new requests + * @return this spider, to allow call chaining + */ + public Spider addUrl(String... urls) { + for (String url : urls) { + addRequest(new Request(url)); + } + signalNewUrl(); + return this; + } + + private void signalNewUrl() { + try { + newUrlLock.lock(); + newUrlCondition.signalAll(); + } finally { + newUrlLock.unlock(); + } + } + public void start() { runAsync(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index fa8dab6d..2c53b2d0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + Spider.create(new OschinaBlogPageProcesser()).thread(10).run(); } }