diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
index ff964605..285a63d7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java
@@ -16,13 +16,11 @@ public class SimplePageProcessor implements PageProcessor {
private String urlPattern;
- private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31";
-
private Site site;
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().addStartUrl(startUrl).
- setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
+ setDomain(UrlUtils.getDomain(startUrl));
//compile "*" expression to regex
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
new file mode 100644
index 00000000..bf324359
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java
@@ -0,0 +1,235 @@
+package us.codecraft.webmagic.downloader;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.log4j.Logger;
+import us.codecraft.webmagic.*;
+import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.processor.SimplePageProcessor;
+import us.codecraft.webmagic.selector.Html;
+import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.FilePersistentBase;
+import us.codecraft.webmagic.utils.UrlUtils;
+
+import java.io.*;
+
+/**
+ * Caches downloaded pages as local files and serves later requests from that cache.
+ * <p>
+ * One instance can play three roles in a spider: as a {@link Pipeline} it writes the
+ * "html" result field of each page to a file, as a {@link Downloader} it answers requests
+ * from those files (delegating to another downloader on a miss), and as a
+ * {@link PageProcessor} it delegates to a {@link SimplePageProcessor}.
+ * <p>
+ * Cache file format: the first line is {@code url:\t<url>}; the html follows, prefixed
+ * with {@code html:\t}.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.1
+ */
+public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor {
+
+    /** Prefix of the first line of a cache file; the page url follows it. */
+    private static final String URL_PREFIX = "url:\t";
+
+    /** Prefix of the second line of a cache file; the page html follows it. */
+    private static final String HTML_PREFIX = "html:\t";
+
+    /** File name suffix, shared by the read and the write path so both resolve the same file. */
+    private static final String FILE_SUFFIX = ".html";
+
+    /** Explicit charset so cached content does not depend on the platform default. */
+    private static final String CHARSET = "UTF-8";
+
+    private Downloader downloaderWhenFileMiss;
+
+    private final PageProcessor pageProcessor;
+
+    private final Logger logger = Logger.getLogger(getClass());
+
+    /**
+     * Creates a cache stored under the default directory {@code /data/webmagic/temp/}.
+     *
+     * @param startUrl   start url handed to the {@link SimplePageProcessor}
+     * @param urlPattern url pattern handed to the {@link SimplePageProcessor}
+     */
+    public FileCache(String startUrl, String urlPattern) {
+        this(startUrl, urlPattern, "/data/webmagic/temp/");
+    }
+
+    /**
+     * Creates a cache stored under {@code path}. Cache misses are fetched with a new
+     * {@link HttpClientDownloader} unless {@link #setDownloaderWhenFileMiss} replaces it.
+     *
+     * @param startUrl   start url handed to the {@link SimplePageProcessor}
+     * @param urlPattern url pattern handed to the {@link SimplePageProcessor}
+     * @param path       base directory of the cache files
+     */
+    public FileCache(String startUrl, String urlPattern, String path) {
+        this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
+        setPath(path);
+        downloaderWhenFileMiss = new HttpClientDownloader();
+    }
+
+    /**
+     * Sets the downloader used when a page is not in the cache.
+     *
+     * @param downloaderWhenFileMiss fallback downloader; with {@code null} a miss yields {@code null}
+     * @return this instance, for chaining
+     */
+    public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) {
+        this.downloaderWhenFileMiss = downloaderWhenFileMiss;
+        return this;
+    }
+
+    /**
+     * Returns the cached page for the request, falling back to the miss downloader.
+     *
+     * @param request request to serve
+     * @param task    task whose UUID names the cache sub-directory
+     * @return the page, or {@code null} when neither the cache nor the fallback yields one
+     */
+    @Override
+    public Page download(Request request, Task task) {
+        Page page = readFromCache(request, task);
+        if (page == null) {
+            page = downloadWhenMiss(request, task);
+        }
+        return page;
+    }
+
+    /** Ignored: the thread count is not used by this class. */
+    @Override
+    public void setThread(int thread) {
+
+    }
+
+    /**
+     * Reads the cached page for a request.
+     *
+     * @return the page, or {@code null} if the file is missing, unreadable, incomplete or
+     *         was written for a different url
+     */
+    private Page readFromCache(Request request, Task task) {
+        String url = request.getUrl();
+        BufferedReader reader = null;
+        try {
+            File file = getCacheFile(task, url);
+            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), CHARSET));
+            // Constant-first equals: readLine() returns null for an empty file.
+            if (!(URL_PREFIX + url).equals(reader.readLine())) {
+                return null;
+            }
+            String html = getHtml(reader);
+            if (html == null) {
+                return null;
+            }
+            Page page = new Page();
+            page.setRequest(request);
+            page.setUrl(PlainText.create(url));
+            page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, url)));
+            return page;
+        } catch (FileNotFoundException e) {
+            logger.info("File not exist for url " + url);
+        } catch (IOException e) {
+            logger.warn("File read error for url " + url, e);
+        } finally {
+            // Always release the file handle, on hits, misses and errors alike.
+            closeQuietly(reader);
+        }
+        return null;
+    }
+
+    /**
+     * Reads the html part of a cache file whose url line has already been consumed.
+     *
+     * @return the html with line breaks preserved, or {@code null} if the html line is missing
+     * @throws IOException if reading fails
+     */
+    private String getHtml(BufferedReader bufferedReader) throws IOException {
+        String line = bufferedReader.readLine();
+        if (line == null) {
+            return null;
+        }
+        StringBuilder htmlBuilder = new StringBuilder(StringUtils.removeStart(line, HTML_PREFIX));
+        while ((line = bufferedReader.readLine()) != null) {
+            // Keep the line break: gluing lines together can merge adjacent tokens.
+            htmlBuilder.append('\n').append(line);
+        }
+        return htmlBuilder.toString();
+    }
+
+    /** Delegates to the fallback downloader, or returns {@code null} when none is set. */
+    private Page downloadWhenMiss(Request request, Task task) {
+        if (downloaderWhenFileMiss == null) {
+            return null;
+        }
+        return downloaderWhenFileMiss.download(request, task);
+    }
+
+    /**
+     * Writes the "html" field of the result to the cache file of its request url.
+     * Results without an "html" field are skipped so the text "null" is never cached.
+     *
+     * @param resultItems extraction result of one page
+     * @param task        task whose UUID names the cache sub-directory
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        String url = resultItems.getRequest().getUrl();
+        Object html = resultItems.get("html");
+        if (html == null) {
+            return;
+        }
+        PrintWriter printWriter = null;
+        try {
+            printWriter = new PrintWriter(
+                    new OutputStreamWriter(new FileOutputStream(getCacheFile(task, url)), CHARSET));
+            printWriter.println(URL_PREFIX + url);
+            printWriter.println(HTML_PREFIX + html);
+            // PrintWriter swallows IOExceptions, so failures must be polled explicitly.
+            if (printWriter.checkError()) {
+                logger.warn("write file error for url " + url);
+            }
+        } catch (IOException e) {
+            logger.warn("write file error", e);
+        } finally {
+            closeQuietly(printWriter);
+        }
+    }
+
+    /** Delegates page processing to the wrapped {@link SimplePageProcessor}. */
+    @Override
+    public void process(Page page) {
+        pageProcessor.process(page);
+    }
+
+    /** Returns the site of the wrapped {@link SimplePageProcessor}. */
+    @Override
+    public Site getSite() {
+        return pageProcessor.getSite();
+    }
+
+    /**
+     * Resolves the cache file of a url: {@code <path>/<task uuid>/<md5 of url>.html}.
+     * Both {@link #download} and {@link #process(ResultItems, Task)} go through here,
+     * which keeps the file they read and the file they write identical.
+     */
+    private File getCacheFile(Task task, String url) throws IOException {
+        String directory = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
+        return getFile(directory + DigestUtils.md5Hex(url) + FILE_SUFFIX);
+    }
+
+    /** Closes the resource, ignoring failures: nothing useful can be done about them here. */
+    private void closeQuietly(Closeable closeable) {
+        if (closeable == null) {
+            return;
+        }
+        try {
+            closeable.close();
+        } catch (IOException ignored) {
+            // Best effort only.
+        }
+    }
+}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java
deleted file mode 100644
index cca5b206..00000000
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java
+++ /dev/null
@@ -1,97 +0,0 @@
-package us.codecraft.webmagic.downloader;
-
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.log4j.Logger;
-import us.codecraft.webmagic.Page;
-import us.codecraft.webmagic.Request;
-import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.selector.Html;
-import us.codecraft.webmagic.selector.PlainText;
-
-import java.io.*;
-
-/**
- * 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。
- * @author code4crafer@gmail.com
- * Date: 13-6-24
- * Time: 上午7:24
- */
-public class FileDownloader implements Downloader {
-
- private String path = "/data/temp/webmagic/";
-
- private Downloader downloaderWhenFileMiss;
-
- private Logger logger = Logger.getLogger(getClass());
-
- public FileDownloader() {
- this("/data/temp/webmagic/", null);
- }
-
- public FileDownloader(String path) {
- this(path, null);
- }
-
- public FileDownloader(String path, Downloader downloaderWhenFileMiss) {
- if (!path.endsWith("/")&&!path.endsWith("\\")){
- path+="/";
- }
- this.path = path;
- this.downloaderWhenFileMiss = downloaderWhenFileMiss;
- }
-
- @Override
- public Page download(Request request, Task task) {
- String path = this.path + "/" + task.getUUID() + "/";
- Page page = null;
- try {
- final File file = new File(path + DigestUtils.md5Hex(request.getUrl()));
- BufferedReader bufferedReader = new BufferedReader(new FileReader(file));
- String line = null;
- line = bufferedReader.readLine();
- if (line.equals("url:\t" + request.getUrl())) {
- final String html = getHtml(bufferedReader);
- page = new Page();
- page.setRequest(request);
- page.setUrl(PlainText.create(request.getUrl()));
- page.setHtml(Html.create(html));
- }
- } catch (IOException e) {
- if (e instanceof FileNotFoundException) {
- logger.info("File not exist for url " + request.getUrl());
- } else {
- logger.warn("File read error for url " + request.getUrl(), e);
- }
- }
- if (page == null) {
- page = downloadWhenMiss(request, task);
- }
- return page;
- }
-
- @Override
- public void setThread(int thread) {
-
- }
-
- private String getHtml(BufferedReader bufferedReader) throws IOException {
- String line;
- StringBuilder htmlBuilder= new StringBuilder();
- line = bufferedReader.readLine();
- line = StringUtils.removeStart(line, "html:\t");
- htmlBuilder.append(line);
- while ((line=bufferedReader.readLine())!=null){
- htmlBuilder.append(line);
- }
- return htmlBuilder.toString();
- }
-
- private Page downloadWhenMiss(Request request, Task task) {
- Page page = null;
- if (downloaderWhenFileMiss != null) {
- page = downloaderWhenFileMiss.download(request, task);
- }
- return page;
- }
-}
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
new file mode 100644
index 00000000..fc3debfa
--- /dev/null
+++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java
@@ -0,0 +1,27 @@
+package us.codecraft.webmagic.downloader;
+
+import org.junit.Test;
+import us.codecraft.webmagic.Spider;
+
+/**
+ * Smoke test: crawls with a single {@link FileCache} serving as page processor,
+ * downloader and pipeline. Nothing is asserted; the run only has to complete.
+ *
+ * @author code4crafter@gmail.com
+ */
+public class FileCacheTest {
+
+    private static final String START_URL = "http://my.oschina.net/flashsword/blog";
+
+    private static final String URL_PATTERN = "http://my.oschina.net/flashsword/blog/*";
+
+//    @Ignore("takes long")
+    @Test
+    public void test() {
+        FileCache cache = new FileCache(START_URL, URL_PATTERN);
+        Spider.create(cache)
+                .downloader(cache)
+                .pipeline(cache)
+                .run();
+    }
+}
diff --git a/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml b/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml
index a0490623..bd0d51b6 100644
--- a/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml
+++ b/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml
@@ -4,7 +4,7 @@
Sat Aug 17 14:14:45 CST 2013
-
+
@author code4crafer@gmail.com
Date: 13-6-24