diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index eb2c1321..75cb7b8a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -148,4 +148,15 @@ public class Page { public ResultItems getResultItems() { return resultItems; } + + @Override + public String toString() { + return "Page{" + + "request=" + request + + ", resultItems=" + resultItems + + ", html=" + html + + ", url=" + url + + ", targetRequests=" + targetRequests + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 905dbe59..2109e72a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -113,4 +113,13 @@ public class Request implements Serializable { public void setUrl(String url) { this.url = url; } + + @Override + public String toString() { + return "Request{" + + "url='" + url + '\'' + + ", extras=" + extras + + ", priority=" + priority + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 32653d6c..e23a8e70 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -40,33 +40,33 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class Spider implements Runnable, Task { - private Downloader downloader; + protected Downloader downloader; - private List pipelines = new ArrayList(); + protected List pipelines = new ArrayList(); - private PageProcessor pageProcessor; + protected PageProcessor pageProcessor; - private List startUrls; + protected List startUrls; - private Site site; + protected Site site; - private String uuid; + protected String uuid; - private Scheduler scheduler = new QueueScheduler(); + protected Scheduler scheduler = new QueueScheduler(); - private Logger logger = Logger.getLogger(getClass()); + protected Logger logger = Logger.getLogger(getClass()); - private ExecutorService executorService; + protected ExecutorService executorService; - private int threadNum = 1; + protected int threadNum = 1; - private AtomicInteger stat = new AtomicInteger(STAT_INIT); + protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - private final static int STAT_INIT = 0; + protected final static int STAT_INIT = 0; - private final static int STAT_RUNNING = 1; + protected final static int STAT_RUNNING = 1; - private final static int STAT_STOPPED = 2; + protected final static int STAT_STOPPED = 2; /** * 使用已定义的抽取规则新建一个Spider。 @@ -206,7 +206,7 @@ public class Spider implements Runnable, Task { destroy(); } - private void destroy() { + protected void destroy() { destroyEach(downloader); destroyEach(pageProcessor); for (Pipeline pipeline : pipelines) { @@ -233,7 +233,7 @@ public class Spider implements Runnable, Task { } } - private void processRequest(Request request) { + protected void processRequest(Request request) { Page page = downloader.download(request, this); if (page == null) { sleep(site.getSleepTime()); @@ -249,7 +249,7 @@ public class Spider implements Runnable, Task { sleep(site.getSleepTime()); } - private void sleep(int time) { + protected void sleep(int time) { try { Thread.sleep(time); } catch (InterruptedException e) { @@ -257,7 +257,7 @@ public class Spider implements Runnable, Task { } } - private void addRequest(Page page) { + protected void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { scheduler.push(request, this); @@ -265,7 +265,7 @@ public class Spider implements Runnable, Task { } } - private void checkIfNotRunning() { + protected void checkIfNotRunning() { if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) { throw new IllegalStateException("Spider is already running!"); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 0e0977ae..dd805c6f 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -66,13 +66,7 @@ public class HttpClientDownloader implements Downloader { } // handleGzip(httpResponse); - String content = IOUtils.toString(httpResponse.getEntity().getContent(), - charset); - Page page = new Page(); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); - page.setUrl(new PlainText(request.getUrl())); - page.setRequest(request); - return page; + return handleResponse(request, charset, httpResponse,task); } else { logger.warn("code error " + statusCode + "\t" + request.getUrl()); } @@ -82,6 +76,16 @@ public class HttpClientDownloader implements Downloader { return null; } + protected Page handleResponse(Request request, String charset, HttpResponse httpResponse,Task task) throws IOException { + String content = IOUtils.toString(httpResponse.getEntity().getContent(), + charset); + Page page = new Page(); + page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + return page; + } + @Override public void setThread(int thread) { poolSize=thread; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 252ccd5f..9c88ba9e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -4,8 +4,8 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.FilePersistentBase; -import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; @@ -18,9 +18,7 @@ import java.util.Map; * Date: 13-4-21 * Time: 下午6:28 */ -public class FilePipeline implements Pipeline { - - private String path = "/data/webmagic/"; +public class FilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = Logger.getLogger(getClass()); @@ -28,7 +26,7 @@ public class FilePipeline implements Pipeline { * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" */ public FilePipeline() { - + setPath("/data/webmagic/"); } /** @@ -37,21 +35,14 @@ public class FilePipeline implements Pipeline { * @param path 文件保存路径 */ public FilePipeline(String path) { - if (!path.endsWith("/")&&!path.endsWith("\\")){ - path+="/"; - } - this.path = path; + setPath(path); } @Override public void process(ResultItems resultItems, Task task) { - String path = this.path + "/" + task.getUUID() + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } + String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java new file mode 100644 index 00000000..509a71fb --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java @@ -0,0 +1,51 @@ +package us.codecraft.webmagic.utils; + +import java.io.File; + +/** + * 文件持久化的基础类。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-11
+ * Time: 下午4:21
+ */ +public class FilePersistentBase { + + protected String path; + + public static String PATH_SEPERATOR = "/"; + + static { + String property = System.getProperties().getProperty("file.separator"); + if (property != null) { + PATH_SEPERATOR = property; + } + } + + public void setPath(String path) { + this.path = path; + if (!path.endsWith(PATH_SEPERATOR)) { + path += PATH_SEPERATOR; + } + } + + public File getFile(String fullName) { + checkAndMakeParentDirecotry(fullName); + return new File(fullName); + } + + public void checkAndMakeParentDirecotry(String fullName) { + int index = fullName.lastIndexOf(PATH_SEPERATOR); + if (index > 0) { + String path = fullName.substring(0, index); + File file = new File(path); + if (!file.exists()) { + file.mkdirs(); + } + } + } + + public String getPath() { + return path; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index e5a41e1d..977dcde8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.model; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; /** * 基于Model的Spider,封装后的入口类。
@@ -20,6 +21,10 @@ public class OOSpider extends Spider { this.modelPageProcessor = modelPageProcessor; } + public OOSpider(PageProcessor pageProcessor) { + super(pageProcessor); + } + /** * 创建一个爬虫。
* @param site diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java index a6b73ccf..c66f52b6 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -7,8 +7,8 @@ import org.apache.log4j.Logger; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.PageModelPipeline; +import us.codecraft.webmagic.utils.FilePersistentBase; -import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; @@ -21,38 +21,29 @@ import java.io.PrintWriter; * Date: 13-4-21 * Time: 下午6:28 */ -public class JsonFilePageModelPipeline implements PageModelPipeline { - - private String path = "/data/webmagic/"; +public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline { private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + * 新建一个JsonFilePageModelPipeline,使用默认保存路径"/data/webmagic/" */ public JsonFilePageModelPipeline() { - + setPath("/data/webmagic/"); } /** - * 新建一个FilePipeline + * 新建一个JsonFilePageModelPipeline * * @param path 文件保存路径 */ public JsonFilePageModelPipeline(String path) { - if (!path.endsWith("/") && !path.endsWith("\\")) { - path += "/"; - } - this.path = path; + setPath(path); } @Override public void process(Object o, Task task) { String path = this.path + "/" + task.getUUID() + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } try { String filename; if (o instanceof HasKey) { @@ -60,7 +51,7 @@ public class JsonFilePageModelPipeline implements PageModelPipeline { } else { filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json"; } - PrintWriter printWriter = new PrintWriter(new FileWriter(filename)); + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename))); printWriter.write(JSON.toJSONString(o)); printWriter.close(); } catch (IOException e) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java index 53dba9e4..f2478f01 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java @@ -5,6 +5,7 @@ import org.apache.commons.codec.digest.DigestUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.utils.FilePersistentBase; import java.io.File; import java.io.FileWriter; @@ -18,40 +19,31 @@ import java.io.PrintWriter; * Date: 13-4-21 * Time: 下午6:28 */ -public class JsonFilePipeline implements Pipeline { - - private String path = "/data/webmagic/"; +public class JsonFilePipeline extends FilePersistentBase implements Pipeline { private Logger logger = Logger.getLogger(getClass()); /** - * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + * 新建一个JsonFilePipeline,使用默认保存路径"/data/webmagic/" */ public JsonFilePipeline() { - + setPath("/data/webmagic"); } /** - * 新建一个FilePipeline + * 新建一个JsonFilePipeline * * @param path 文件保存路径 */ public JsonFilePipeline(String path) { - if (!path.endsWith("/")&&!path.endsWith("\\")){ - path+="/"; - } - this.path = path; + setPath(path); } @Override public void process(ResultItems resultItems, Task task) { String path = this.path + "/" + task.getUUID() + "/"; - File file = new File(path); - if (!file.exists()) { - file.mkdirs(); - } try { - PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")); + PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"))); printWriter.write(JSON.toJSONString(resultItems.getAll())); printWriter.close(); } catch (IOException e) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e7c5bcd4..d6d63492 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -12,8 +12,8 @@ import us.codecraft.webmagic.Task; * 使用redis管理url,构建一个分布式的爬虫。
* * @author code4crafter@gmail.com
- * Date: 13-7-25
- * Time: 上午7:07
+ * Date: 13-7-25
+ * Time: 上午7:07
*/ public class RedisScheduler implements Scheduler { @@ -32,34 +32,42 @@ public class RedisScheduler implements Scheduler { @Override public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); - //使用SortedSet进行url去重 - if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) { - //使用List保存队列 - jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); - jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl()); - if (request.getExtras() != null) { - String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl()); - byte[] bytes = JSON.toJSONString(request).getBytes(); - jedis.set(key.getBytes(), bytes); + try { + //使用Set进行url去重 + if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { + //使用List保存队列 + jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); + jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); + if (request.getExtras() != null) { + String field = DigestUtils.shaHex(request.getUrl()); + String value = JSON.toJSONString(request); + jedis.hset((ITEM_PREFIX + task.getUUID()), field, value); + } } + } finally { + pool.returnResource(jedis); } - pool.returnResource(jedis); } @Override public synchronized Request poll(Task task) { Jedis jedis = pool.getResource(); - String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); - if (url == null) { - return null; - } - String key = ITEM_PREFIX + DigestUtils.shaHex(url); - byte[] bytes = jedis.get(key.getBytes()); - if (bytes != null) { - Request o = JSON.parseObject(new String(bytes),Request.class); - return o; + try { + String url = jedis.lpop(QUEUE_PREFIX + task.getUUID()); + if (url == null) { + return null; + } + String key = ITEM_PREFIX + task.getUUID(); + String field = DigestUtils.shaHex(url); + byte[] bytes = jedis.hget(key.getBytes(),field.getBytes()); + if (bytes != null) { + Request o = JSON.parseObject(new String(bytes), Request.class); + return o; + } + Request request = new Request(url); + return request; + } finally { + pool.returnResource(jedis); } - pool.returnResource(jedis); - return new Request(url); } }