update comments

pull/17/head
yihua.huang 12 years ago
parent 6cc1d62a08
commit 5f1f4cbc46

@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* <pre class="zh">
* Page
* *
* * Object storing extracted result and urls to be crawled.<br>
* {@link #getUrl()} Url * Main method <br>
* {@link #getHtml()} html * {@link #getUrl()} get url of current page <br>
* {@link #putField(String, Object)} * {@link #getHtml()} get content of current page <br>
* {@link #getResultItems()} {@link us.codecraft.webmagic.pipeline.Pipeline} * {@link #putField(String, Object)} save extracted result <br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} * {@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
* </pre>
* <pre class="en">
* Store extracted result and urls to be crawled.
*
* Main method
* {@link #getUrl()} get url of current page
* {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl
*
* </pre>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
*/ */
public class Page { public class Page {
@ -55,19 +44,19 @@ public class Page {
} }
/** /**
* store extract results
* *
* * @param key
* @param key key * @param field
* @param field value
*/ */
public void putField(String key, Object field) { public void putField(String key, Object field) {
resultItems.put(key, field); resultItems.put(key, field);
} }
/** /**
* html * get html content of page
* *
* @return html html * @return html
*/ */
public Selectable getHtml() { public Selectable getHtml() {
return html; return html;
@ -82,9 +71,9 @@ public class Page {
} }
/** /**
* * add urls to crawl
* *
* @param requests * @param requests
*/ */
public void addTargetRequests(List<String> requests) { public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) { synchronized (targetRequests) {
@ -99,9 +88,9 @@ public class Page {
} }
/** /**
* * add url to crawl
* *
* @param requestString * @param requestString
*/ */
public void addTargetRequest(String requestString) { public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
@ -114,9 +103,9 @@ public class Page {
} }
/** /**
* 使 * add a request to crawl
* *
* @param request * @param request
*/ */
public void addTargetRequest(Request request) { public void addTargetRequest(Request request) {
synchronized (targetRequests) { synchronized (targetRequests) {
@ -125,27 +114,22 @@ public class Page {
} }
/** /**
* Url * get url of current page
* *
* @return url url * @return url of current page
*/ */
public Selectable getUrl() { public Selectable getUrl() {
return url; return url;
} }
/**
* url
*
* @param url
*/
public void setUrl(Selectable url) { public void setUrl(Selectable url) {
this.url = url; this.url = url;
} }
/** /**
* * get request of current page
* *
* @return request * @return request
*/ */
public Request getRequest() { public Request getRequest() {
return request; return request;

@ -1,33 +1,17 @@
package us.codecraft.webmagic; package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
/** /**
* <div class="zh"> * Object contains url to crawl.<br>
* Requesturl<br/> * It contains some additional information.<br>
* PageProcessorRequest{@link us.codecraft.webmagic.Page#getRequest()} <br/>
* <br/>
* Requestextra<br/>
* <pre>
* Example:
* <a href="${link}">${linktext}</a>linklinktext
*
* public void process(Page page){
* Request request = new Request(link,linktext);
* page.addTargetRequest(request)
* }
*
* public void process(Page page){
* String linktext = (String)page.getRequest().getExtra()[0];
* }
* </pre>
* </div>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 11:37
*/ */
public class Request implements Serializable { public class Request implements Serializable {
@ -36,20 +20,22 @@ public class Request implements Serializable {
private String url; private String url;
/** /**
* * Store additional information in extras.
*/ */
private Map<String, Object> extras; private Map<String, Object> extras;
/**
* Priority of the request.<br>
* Requests with a higher priority will be processed earlier. <br>
* Need a scheduler supporting priority.<br>
* However, no scheduler in webmagic supports priority yet.
*/
@Experimental
private double priority; private double priority;
public Request() { public Request() {
} }
/**
* request
*
* @param url url
*/
public Request(String url) { public Request(String url) {
this.url = url; this.url = url;
} }
@ -59,12 +45,14 @@ public class Request implements Serializable {
} }
/** /**
* URL<br> * Set the priority of request for sorting.<br>
* Scheduler<br> * Need a scheduler supporting priority.<br>
* Scheduler == <br> * However, no scheduler in webmagic supports priority yet.
* @param priority *
* @param priority
* @return this * @return this
*/ */
@Experimental
public Request setPriority(double priority) { public Request setPriority(double priority) {
this.priority = priority; this.priority = priority;
return this; return this;
@ -85,11 +73,6 @@ public class Request implements Serializable {
return this; return this;
} }
/**
* url
*
* @return url url
*/
public String getUrl() { public String getUrl() {
return url; return url;
} }

@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
/** /**
* PageProcessor{@link us.codecraft.webmagic.pipeline.Pipeline}<br> * Object contains extract results.<br>
* It is contained in Page and will be processed in pipeline.
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br> * @since 0.1.0
* Time: 12:20 <br> * @see Page
* @see us.codecraft.webmagic.pipeline.Pipeline
*/ */
public class ResultItems { public class ResultItems {
@ -25,7 +28,7 @@ public class ResultItems {
return (T) fields.get(key); return (T) fields.get(key);
} }
public Map<String, Object> getAll() { public Map<String, Object> getAll() {
return fields; return fields;
} }
@ -44,8 +47,10 @@ public class ResultItems {
} }
/** /**
* pipeline * Whether to skip the result.<br>
* @return true * Result which is skipped will not be processed by Pipeline.
*
* @return whether to skip the result
*/ */
public boolean isSkip() { public boolean isSkip() {
return skip; return skip;
@ -53,8 +58,10 @@ public class ResultItems {
/** /**
* pipeline * Set whether to skip the result.<br>
* @param skip * Result which is skipped will not be processed by Pipeline.
*
* @param skip whether to skip the result
* @return this * @return this
*/ */
public ResultItems setSkip(boolean skip) { public ResultItems setSkip(boolean skip) {

@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*; import java.util.*;
/** /**
* Site<br> * Object contains setting for crawler.<br>
* getter<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 12:13 * @see us.codecraft.webmagic.processor.PageProcessor
*/ */
public class Site { public class Site {
@ -22,6 +21,9 @@ public class Site {
private String charset; private String charset;
/**
* startUrls are the urls the crawler starts with.
*/
private List<String> startUrls = new ArrayList<String>(); private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000; private int sleepTime = 3000;
@ -37,19 +39,19 @@ public class Site {
} }
/** /**
* Sitenew Site() * Create a new Site.
* *
* @return * @return new site
*/ */
public static Site me() { public static Site me() {
return new Site(); return new Site();
} }
/** /**
* cookie访cookie{@link #getDomain()} * Add a cookie with domain {@link #getDomain()}
* *
* @param name cookie * @param name
* @param value cookie * @param value
* @return this * @return this
*/ */
public Site addCookie(String name, String value) { public Site addCookie(String name, String value) {
@ -58,7 +60,7 @@ public class Site {
} }
/** /**
* user-agentuser-agent * set user agent
* *
* @param userAgent userAgent * @param userAgent userAgent
* @return this * @return this
@ -69,27 +71,27 @@ public class Site {
} }
/** /**
* cookie * get cookies
* *
* @return cookie * @return cookies
*/ */
public Map<String, String> getCookies() { public Map<String, String> getCookies() {
return cookies; return cookies;
} }
/** /**
* user-agent * get user agent
* *
* @return user-agent * @return user agent
*/ */
public String getUserAgent() { public String getUserAgent() {
return userAgent; return userAgent;
} }
/** /**
* domain * get domain
* *
* @return domain * @return domain
*/ */
public String getDomain() { public String getDomain() {
if (domain == null) { if (domain == null) {
@ -101,10 +103,9 @@ public class Site {
} }
/** /**
* <br> * set the domain of site.
* Spider
* *
* @param domain * @param domain
* @return this * @return this
*/ */
public Site setDomain(String domain) { public Site setDomain(String domain) {
@ -113,10 +114,10 @@ public class Site {
} }
/** /**
* Html meta<br> * Set charset of page manually.<br>
* encoding<br> * When charset is not set or set to null, it can be auto detected by Http header.
* *
* @param charset "utf-8""gbk" * @param charset
* @return this * @return this
*/ */
public Site setCharset(String charset) { public Site setCharset(String charset) {
@ -125,20 +126,21 @@ public class Site {
} }
/** /**
* * get charset set manually
* *
* @return domain * @return charset
*/ */
public String getCharset() { public String getCharset() {
return charset; return charset;
} }
/** /**
* http<br> * Set acceptStatCode.<br>
* 200<br> * When status code of http response is in acceptStatCodes, it will be processed.<br>
* <br> * {200} by default.<br>
* It does not need to be set explicitly.<br>
* *
* @param acceptStatCode * @param acceptStatCode
* @return this * @return this
*/ */
public Site setAcceptStatCode(Set<Integer> acceptStatCode) { public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
@ -147,27 +149,27 @@ public class Site {
} }
/** /**
* * get acceptStatCode
* *
* @return * @return acceptStatCode
*/ */
public Set<Integer> getAcceptStatCode() { public Set<Integer> getAcceptStatCode() {
return acceptStatCode; return acceptStatCode;
} }
/** /**
* * get start urls
* *
* @return * @return start urls
*/ */
public List<String> getStartUrls() { public List<String> getStartUrls() {
return startUrls; return startUrls;
} }
/** /**
* * Add a url to the start urls.<br>
* *
* @param startUrl * @param startUrl
* @return this * @return this
*/ */
public Site addStartUrl(String startUrl) { public Site addStartUrl(String startUrl) {
@ -176,9 +178,10 @@ public class Site {
} }
/** /**
* (...) * Set the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
* *
* @param sleepTime * @param sleepTime
* @return this * @return this
*/ */
public Site setSleepTime(int sleepTime) { public Site setSleepTime(int sleepTime) {
@ -187,25 +190,26 @@ public class Site {
} }
/** /**
* * Get the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
* *
* @return * @return the interval between the processing of two pages
*/ */
public int getSleepTime() { public int getSleepTime() {
return sleepTime; return sleepTime;
} }
/** /**
* 0 * Get retry times when download fail, 0 by default.<br>
* *
* @return * @return retry times when download fail
*/ */
public int getRetryTimes() { public int getRetryTimes() {
return retryTimes; return retryTimes;
} }
/** /**
* 0 * Set retry times when download fail, 0 by default.<br>
* *
* @return this * @return this
*/ */

@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* <pre> * Entrance of a crawler.<br>
* webmagic * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
* * Every module is a field of Spider. <br>
* * The modules are defined in interface. <br>
* * You can customize a spider with various implementations of them. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run(); * Examples: <br>
* * <br>
* 使FilePipeline: * A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); * <br>
* * Store results to files by FilePipeline: <br>
* 使FileCacheQueueSchedulerURL: * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) * .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); * <br>
* </pre> * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @see Downloader
* Time: 6:53 * @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/ */
public class Spider implements Runnable, Task { public class Spider implements Runnable, Task {
@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
/** /**
* URL * URL
*
* @param urls url * @param urls url
*/ */
public void test(String... urls){ public void test(String... urls) {
checkComponent(); checkComponent();
if (urls.length>0){ if (urls.length > 0) {
for (String url : urls) { for (String url : urls) {
processRequest(new Request(url)); processRequest(new Request(url));
} }
@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
} }
pageProcessor.process(page); pageProcessor.process(page);
addRequest(page); addRequest(page);
if (!page.getResultItems().isSkip()){ if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) { for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this); pipeline.process(page.getResultItems(), this);
} }
@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
return this; return this;
} }
public Spider clearPipeline(){ public Spider clearPipeline() {
pipelines=new ArrayList<Pipeline>(); pipelines = new ArrayList<Pipeline>();
return this; return this;
} }

@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
* 便 * 便
* *
* @param url * @param url
* @return * @return html
*/ */
public Html download(String url) { public Html download(String url) {
Page page = download(new Request(url), null); Page page = download(new Request(url), null);

@ -2,9 +2,6 @@
<body> <body>
<div class="en"> <div class="en">
Main class "Spider" and models. Main class "Spider" and models.
</div>
<div class="zh">
包括webmagic入口类Spider和一些数据传递的实体类。
</div> </div>
</body> </body>
</html> </html>

@ -1,4 +1,4 @@
package us.codecraft.webmagic.model.annotation; package us.codecraft.webmagic.utils;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>

@ -1,6 +1,6 @@
package us.codecraft.webmagic; package us.codecraft.webmagic;
import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.utils.Experimental;
import java.util.Collection; import java.util.Collection;

@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import us.codecraft.webmagic.*; import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor; import us.codecraft.webmagic.processor.SimplePageProcessor;

@ -1,6 +1,6 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.utils.Experimental;
/** /**
* Interface to be implemented by page model.<br> * Interface to be implemented by page model.<br>

@ -21,7 +21,7 @@ public @interface ComboExtract {
*/ */
ExtractBy[] value(); ExtractBy[] value();
enum Op { public static enum Op {
/** /**
* All extractors will be arranged as a pipeline. <br> * All extractors will be arranged as a pipeline. <br>
* The next extractor uses the result of the previous as source. * The next extractor uses the result of the previous as source.
@ -49,7 +49,10 @@ public @interface ComboExtract {
*/ */
boolean notNull() default false; boolean notNull() default false;
public enum Source { /**
* types of source for extracting.
*/
public static enum Source {
/** /**
* extract from the content extracted by class extractor * extract from the content extracted by class extractor
*/ */

@ -21,7 +21,10 @@ public @interface ExtractBy {
*/ */
String value(); String value();
public enum Type {XPath, Regex, Css} /**
* types of extractor expressions
*/
public static enum Type {XPath, Regex, Css}
/** /**
* Extractor type, support XPath, CSS Selector and regex. * Extractor type, support XPath, CSS Selector and regex.
@ -38,7 +41,10 @@ public @interface ExtractBy {
*/ */
boolean notNull() default false; boolean notNull() default false;
public enum Source { /**
* types of source for extracting.
*/
public static enum Source {
/** /**
* extract from the content extracted by class extractor * extract from the content extracted by class extractor
*/ */

@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel; import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.Experimental; import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap; import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*; import java.util.*;

@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* Store urls and cursor in files so that a Spider can resume the status when shutdown<br> * Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.2.0 * @since 0.2.0

Loading…
Cancel
Save