From 5cb45af3a45f918ac0b69784c9f13cb8824755f8 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 12:10:34 +0800 Subject: [PATCH] +doc --- .../main/java/us/codecraft/webmagic/Page.java | 15 +++++++++++++-- .../java/us/codecraft/webmagic/Request.java | 2 ++ .../java/us/codecraft/webmagic/package.html | 5 +++++ .../model/annotation/ComboExtract.java | 18 ++++++++++++++++++ 4 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index fd881b29..a894269b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,7 +8,7 @@ import java.util.ArrayList; import java.util.List; /** - *
+ * 
  * Page保存了上一次抓取的结果,并可定义待抓取的链接内容。
  *
  *     主要方法:
@@ -19,6 +19,17 @@ import java.util.List;
  *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} 添加待抓取的链接
  *
  * 
+ *
+ * Stores the extracted results and the urls to be crawled.
+ *
+ *     Main methods:
+ *     {@link #getUrl()} get the url of the current page
+ *     {@link #getHtml()}  get the content of the current page
+ *     {@link #putField(String, Object)}  save an extracted result
+ *     {@link #getResultItems()} get the extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
+ *     {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
+ *
+ * 
* * @author code4crafter@gmail.com
 */ @@ -44,7 +55,7 @@ public class Page { } /** - * 保存抽取的结果 + * Save an extracted result. * * @param key 结果的key * @param field 结果的value diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 7a6e557c..b9b8ddf6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -5,6 +5,7 @@ import java.util.HashMap; import java.util.Map; /** + *
* Request对象封装了待抓取的url信息。
* 在PageProcessor中,Request对象可以通过{@link us.codecraft.webmagic.Page#getRequest()} 获取。
*
@@ -22,6 +23,7 @@ import java.util.Map; * String linktext = (String)page.getRequest().getExtra()[0]; * } *
+ * * * @author code4crafter@gmail.com
* Date: 13-4-21 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html index d5ff540a..05328dcb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/package.html @@ -1,5 +1,10 @@ +
 + Contains the webmagic entry class "Spider" and the entity classes used to pass data around. +
+
包括webmagic入口类Spider和一些数据传递的实体类。 +
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java new file mode 100644 index 00000000..1f5f008c --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ComboExtract.java @@ -0,0 +1,18 @@ +package us.codecraft.webmagic.model.annotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * @author code4crafter@gmail.com
+ * Date: 13-8-16
+ * Time: 下午11:09
+ */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD, ElementType.TYPE}) +public @interface ComboExtract { + + + +}