From 1148450ff945c82a07dd1e91df4e6cbd519dc65f Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 18:12:47 +0800 Subject: [PATCH] update filecache to more useful --- .../processor/SimplePageProcessor.java | 4 +- .../webmagic/downloader/FileCache.java | 122 ++++++++++++++++++ .../webmagic/downloader/FileDownloader.java | 97 -------------- .../webmagic/downloader/FileCacheTest.java | 17 +++ .../downloader/FileDownloader-cmnt.xml | 2 +- 5 files changed, 141 insertions(+), 101 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index ff964605..285a63d7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -16,13 +16,11 @@ public class SimplePageProcessor implements PageProcessor { private String urlPattern; - private static final String UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"; - private Site site; public SimplePageProcessor(String startUrl, String urlPattern) { this.site = Site.me().addStartUrl(startUrl). 
- setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); + setDomain(UrlUtils.getDomain(startUrl)); //compile "*" expression to regex this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java new file mode 100644 index 00000000..bf324359 --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileCache.java @@ -0,0 +1,122 @@ +package us.codecraft.webmagic.downloader; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.*; +import us.codecraft.webmagic.pipeline.Pipeline; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.processor.SimplePageProcessor; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.FilePersistentBase; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.io.*; + +/** + * Download a file and save it to a file for caching.
+ * + * + * @author code4crafter@gmail.com + * @since 0.2.1 + */ +public class FileCache extends FilePersistentBase implements Downloader, Pipeline, PageProcessor { + + private Downloader downloaderWhenFileMiss; + + private final PageProcessor pageProcessor; + + private Logger logger = Logger.getLogger(getClass()); + + public FileCache(String startUrl, String urlPattern) { + this(startUrl, urlPattern, "/data/webmagic/temp/"); + } + + public FileCache(String startUrl, String urlPattern, String path) { + this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern); + setPath(path); + downloaderWhenFileMiss = new HttpClientDownloader(); + } + + public FileCache setDownloaderWhenFileMiss(Downloader downloaderWhenFileMiss) { + this.downloaderWhenFileMiss = downloaderWhenFileMiss; + return this; + } + + @Override + public Page download(Request request, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + Page page = null; + try { + final File file = getFile(path + DigestUtils.md5Hex(request.getUrl())); + BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); + String line = bufferedReader.readLine(); + if (line.equals("url:\t" + request.getUrl())) { + final String html = getHtml(bufferedReader); + page = new Page(); + page.setRequest(request); + page.setUrl(PlainText.create(request.getUrl())); + page.setHtml(Html.create(UrlUtils.fixAllRelativeHrefs(html, request.getUrl()))); + } + } catch (IOException e) { + if (e instanceof FileNotFoundException) { + logger.info("File not exist for url " + request.getUrl()); + } else { + logger.warn("File read error for url " + request.getUrl(), e); + } + } + if (page == null) { + page = downloadWhenMiss(request, task); + } + return page; + } + + @Override + public void setThread(int thread) { + + } + + private String getHtml(BufferedReader bufferedReader) throws IOException { + String line; + StringBuilder htmlBuilder = new StringBuilder(); + line = bufferedReader.readLine(); + line = 
StringUtils.removeStart(line, "html:\t"); + htmlBuilder.append(line); + while ((line = bufferedReader.readLine()) != null) { + htmlBuilder.append(line); + } + return htmlBuilder.toString(); + } + + private Page downloadWhenMiss(Request request, Task task) { + Page page = null; + if (downloaderWhenFileMiss != null) { + page = downloaderWhenFileMiss.download(request, task); + } + return page; + } + + @Override + public void process(ResultItems resultItems, Task task) { + String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; + try { + PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"))); + printWriter.println("url:\t" + resultItems.getRequest().getUrl()); + printWriter.println("html:\t" + resultItems.get("html")); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } + + @Override + public void process(Page page) { + pageProcessor.process(page); + } + + @Override + public Site getSite() { + return pageProcessor.getSite(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java deleted file mode 100644 index cca5b206..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/downloader/FileDownloader.java +++ /dev/null @@ -1,97 +0,0 @@ -package us.codecraft.webmagic.downloader; - -import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.log4j.Logger; -import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.selector.Html; -import us.codecraft.webmagic.selector.PlainText; - -import java.io.*; - -/** - * 使用缓存到本地的文件来模拟下载,可以在Spider框架中仅进行抽取工作。
- * @author code4crafer@gmail.com - * Date: 13-6-24 - * Time: 上午7:24 - */ -public class FileDownloader implements Downloader { - - private String path = "/data/temp/webmagic/"; - - private Downloader downloaderWhenFileMiss; - - private Logger logger = Logger.getLogger(getClass()); - - public FileDownloader() { - this("/data/temp/webmagic/", null); - } - - public FileDownloader(String path) { - this(path, null); - } - - public FileDownloader(String path, Downloader downloaderWhenFileMiss) { - if (!path.endsWith("/")&&!path.endsWith("\\")){ - path+="/"; - } - this.path = path; - this.downloaderWhenFileMiss = downloaderWhenFileMiss; - } - - @Override - public Page download(Request request, Task task) { - String path = this.path + "/" + task.getUUID() + "/"; - Page page = null; - try { - final File file = new File(path + DigestUtils.md5Hex(request.getUrl())); - BufferedReader bufferedReader = new BufferedReader(new FileReader(file)); - String line = null; - line = bufferedReader.readLine(); - if (line.equals("url:\t" + request.getUrl())) { - final String html = getHtml(bufferedReader); - page = new Page(); - page.setRequest(request); - page.setUrl(PlainText.create(request.getUrl())); - page.setHtml(Html.create(html)); - } - } catch (IOException e) { - if (e instanceof FileNotFoundException) { - logger.info("File not exist for url " + request.getUrl()); - } else { - logger.warn("File read error for url " + request.getUrl(), e); - } - } - if (page == null) { - page = downloadWhenMiss(request, task); - } - return page; - } - - @Override - public void setThread(int thread) { - - } - - private String getHtml(BufferedReader bufferedReader) throws IOException { - String line; - StringBuilder htmlBuilder= new StringBuilder(); - line = bufferedReader.readLine(); - line = StringUtils.removeStart(line, "html:\t"); - htmlBuilder.append(line); - while ((line=bufferedReader.readLine())!=null){ - htmlBuilder.append(line); - } - return htmlBuilder.toString(); - } - - private Page 
downloadWhenMiss(Request request, Task task) { - Page page = null; - if (downloaderWhenFileMiss != null) { - page = downloaderWhenFileMiss.download(request, task); - } - return page; - } -} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java new file mode 100644 index 00000000..fc3debfa --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Spider; + +/** + * @author code4crafter@gmail.com
+ */ +public class FileCacheTest { + +// @Ignore("takes long") + @Test + public void test() { + FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*"); + Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run(); + } +} diff --git a/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml b/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml index a0490623..bd0d51b6 100644 --- a/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml +++ b/zh_docs/us/codecraft/webmagic/downloader/FileDownloader-cmnt.xml @@ -4,7 +4,7 @@ Sat Aug 17 14:14:45 CST 2013 - + @author code4crafer@gmail.com Date: 13-6-24