From 19229dd855fad7ca846d189d3cc799d5bb95aea7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 10 Aug 2013 08:27:14 +0800 Subject: [PATCH] add JsonFilePageModelPipeline --- .../us/codecraft/webmagic/model/HasKey.java | 20 ++++++ .../pipeline/JsonFilePageModelPipeline.java | 70 +++++++++++++++++++ .../webmagic/model/samples/OschinaBlog.java | 23 +++++- 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java new file mode 100644 index 00000000..dd9ace2d --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.model; + +/** + * 标志一个Model的key。
+ * 实现了这个接口的Model在输出时会使用key()作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。
+ * 如果持久化的文件名是乱码,请在运行的环境变量里加上LANG=zh_CN.UTF-8。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-8-10
+ * Time: 上午7:39
+ */ +public interface HasKey { + + /** + * 在输出时会使用key作为标志(例如JsonFilePageModelPipeline中持久化的文件名)。 + * + * @return key + */ + public String key(); +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java new file mode 100644 index 00000000..a6b73ccf --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePageModelPipeline.java @@ -0,0 +1,70 @@ +package us.codecraft.webmagic.pipeline; + +import com.alibaba.fastjson.JSON; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.log4j.Logger; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HasKey; +import us.codecraft.webmagic.model.PageModelPipeline; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; + +/** + * JSON格式持久化到文件的接口。
+ * 如果持久化的文件名是乱码,请在运行的环境变量里加上LANG=zh_CN.UTF-8。
+ * + * @author code4crafter@gmail.com
+ * Date: 13-4-21 + * Time: 下午6:28 + */ +public class JsonFilePageModelPipeline implements PageModelPipeline { + + private String path = "/data/webmagic/"; + + private Logger logger = Logger.getLogger(getClass()); + + /** + * 新建一个FilePipeline,使用默认保存路径"/data/webmagic/" + */ + public JsonFilePageModelPipeline() { + + } + + /** + * 新建一个FilePipeline + * + * @param path 文件保存路径 + */ + public JsonFilePageModelPipeline(String path) { + if (!path.endsWith("/") && !path.endsWith("\\")) { + path += "/"; + } + this.path = path; + } + + @Override + public void process(Object o, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; + File file = new File(path); + if (!file.exists()) { + file.mkdirs(); + } + try { + String filename; + if (o instanceof HasKey) { + filename = path + ((HasKey)o).key() + ".json"; + } else { + filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json"; + } + PrintWriter printWriter = new PrintWriter(new FileWriter(filename)); + printWriter.write(JSON.toJSONString(o)); + printWriter.close(); + } catch (IOException e) { + logger.warn("write file error", e); + } + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index c1e3ea34..a76fd886 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,10 +1,11 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.ConsolePageModelPipeline; +import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; import java.util.List; @@ -14,7 
+15,7 @@ import java.util.List; * Time: 上午7:52
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog { +public class OschinaBlog implements HasKey{ @ExtractBy("//title") private String title; @@ -27,7 +28,23 @@ public class OschinaBlog { public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new ConsolePageModelPipeline(), OschinaBlog.class).run(); + ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); } + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + + public List getTags() { + return tags; + } + + @Override + public String key() { + return title; + } }