From c59c1fe80d69d3a5f6a24e95b3b9a0a50bcf29b4 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 19:19:27 +0800 Subject: [PATCH] update comments --- .../us/codecraft/webmagic/MultiPageModel.java | 34 ++++++++++++++++--- .../webmagic/model/AfterExtractor.java | 2 -- .../us/codecraft/webmagic/model/OOSpider.java | 22 ++++++++++-- .../us/codecraft/webmagic/model/package.html | 2 +- .../webmagic/utils/MultiKeyMapBase.java | 1 - 5 files changed, 49 insertions(+), 12 deletions(-) diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java index 2e1b713f..e3411fc7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java @@ -1,21 +1,45 @@ package us.codecraft.webmagic; +import us.codecraft.webmagic.model.annotation.Experimental; + import java.util.Collection; /** - * 实现此接口以进行支持爬虫分页抓取。
+ * Extract an object that spans more than one page, such as news and articles.
+ * * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午5:18
*/ +@Experimental public interface MultiPageModel { + /** + * Page key is the identifier for the object. + * + * @return page key + */ public String getPageKey(); - public Collection getOtherPages(); - + /** + * page is the identifier of a page in pages for one object. + * + * @return page + */ public String getPage(); + /** + * other pages to be extracted.
+ * It is used to judge whether an object contains more than one page, and whether the pages of the object are all extracted. + * + * @return other pages + */ + public Collection getOtherPages(); + + /** + * Combine multiPageModels to a whole object. + * + * @param multiPageModel + * @return multiPageModel combined + */ public MultiPageModel combine(MultiPageModel multiPageModel); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java index 3927d116..5b743095 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java @@ -6,8 +6,6 @@ import us.codecraft.webmagic.Page; * 实现这个接口即可在抽取后进行后处理。
* * @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 上午9:42
*/ public interface AfterExtractor { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java index 977dcde8..e04a30d8 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/OOSpider.java @@ -5,10 +5,26 @@ import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; /** - * 基于Model的Spider,封装后的入口类。
+ * The spider for page model extractor。
+ * In webmagic, we call a POJO containing extract result as "page model".
+ * You can customize a crawler by write a page model with annotations.
+ * Such as: + *
+ * {@literal @}TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
+ *  public class OschinaBlog{
+ *
+ *      {@literal @}ExtractBy("//title")
+ *      private String title;
+ *
+ *      {@literal @}ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css)
+ *      private String content;
+ *
+ *      {@literal @}ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
+ *      private List<String> tags;
+ * }
+ 
* @author code4crafter@gmail.com
- * Date: 13-8-3
- * Time: 上午9:51
+ * @since 0.2.0 */ public class OOSpider extends Spider { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html index d62cc002..63a6784c 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html @@ -1,5 +1,5 @@ -webmagic对抓取器编写的面向模型(称为PageModel)的封装。基于POJO及注解即可实现一个PageProcessor。 +Page model and annotations used to customize a crawler. diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java index a7d8378e..d0537163 100755 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/MultiKeyMapBase.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.utils; /** * @author code4crafter@gmail.com - * Date Dec 14, 2012 */ import java.util.HashMap;