diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 6eb7d615..634f09d3 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -25,6 +25,11 @@ freemarker 2.3.15 + + redis.clients + jedis + 2.0.0 + diff --git a/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java new file mode 100644 index 00000000..e87ee335 --- /dev/null +++ b/webmagic-plugin/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.scheduler; + +import redis.clients.jedis.Jedis; +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.JedisPoolConfig; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.schedular.Scheduler; + +/** + * 使用redis管理url,构建一个分布式的爬虫。
+ * @author yihua.huang@dianping.com
+ * @date: 13-7-25
+ * Time: 上午7:07
+ */ +public class RedisScheduler implements Scheduler{ + + private JedisPool pool; + + private static final String QUEUE_PREFIX = "queue_"; + + private static final String SET_PREFIX = "set_"; + + public RedisScheduler(String host){ + pool = new JedisPool(new JedisPoolConfig(), host); + } + + @Override + public synchronized void push(Request request, Task task) { + Jedis jedis = pool.getResource(); + if (jedis.zrank(SET_PREFIX+task.getUUID(),request.getUrl())==null){ + jedis.rpush(QUEUE_PREFIX+task.getUUID(),request.getUrl()); + jedis.zadd(SET_PREFIX+task.getUUID(),System.currentTimeMillis(),request.getUrl()); + } + pool.returnResource(jedis); + } + + @Override + public synchronized Request poll(Task task) { + Jedis jedis = pool.getResource(); + String url = jedis.lpop(QUEUE_PREFIX+task.getUUID()); + pool.returnResource(jedis); + return new Request(url); + } +} diff --git a/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java new file mode 100644 index 00000000..0f556d28 --- /dev/null +++ b/webmagic-plugin/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.scheduler; + +import org.junit.Before; +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-7-25
+ * Time: 上午7:51
+ */ +public class RedisSchedulerTest { + + private RedisScheduler redisScheduler; + + @Before + public void setUp() { + redisScheduler = new RedisScheduler("localhost"); + } + + @Test + public void test() { + Task task = new Task() { + @Override + public String getUUID() { + return "1"; + } + + @Override + public Site getSite() { + return null; + } + }; + redisScheduler.push(new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"), task); + Request poll = redisScheduler.poll(task); + System.out.println(poll.getUrl()); + + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java index 383422f6..f7c5f7fa 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GlobalProcessor.java @@ -3,11 +3,9 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.downloader.FileDownloader; -import us.codecraft.webmagic.downloader.HttpClientDownloader; import us.codecraft.webmagic.pipeline.FilePipeline; import us.codecraft.webmagic.processor.PageProcessor; -import us.codecraft.webmagic.schedular.FileCacheQueueScheduler; +import us.codecraft.webmagic.scheduler.RedisScheduler; import java.util.List; @@ -40,9 +38,12 @@ public class GlobalProcessor implements PageProcessor { public static void main(String[] args) { Spider.create(new GlobalProcessor()).thread(10) - .scheduler(new FileCacheQueueScheduler("/data/webmagic/test")) - .downloader(new FileDownloader("/data/webmagic/test", new HttpClientDownloader())) - .pipeline(new FilePipeline("/data/webmagic/test")) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/webmagic/test/")) + .runAsync(); + Spider.create(new GlobalProcessor()).thread(10) + .scheduler(new RedisScheduler("localhost")) + .pipeline(new FilePipeline("/data/webmagic/test/")) .run(); } }