update comments

pull/17/head
yihua.huang 12 years ago
parent 6cc1d62a08
commit 5f1f4cbc46

@ -8,30 +8,19 @@ import java.util.ArrayList;
import java.util.List;
/**
* Object storing extracted result and urls to be crawled.<br>
* Main method <br>
* {@link #getUrl()} get url of current page <br>
* {@link #getHtml()} get content of current page <br>
* {@link #putField(String, Object)} save extracted result <br>
* {@link #getResultItems()} get extracted results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br>
* {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to crawl <br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.downloader.Downloader
* @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Page {
@ -55,19 +44,19 @@ public class Page {
}
/**
* store extract results
*
* @param key key
* @param field value
*/
public void putField(String key, Object field) {
resultItems.put(key, field);
}
/**
* get html content of page
*
* @return html
*/
public Selectable getHtml() {
return html;
@ -82,9 +71,9 @@ public class Page {
}
/**
* add urls to crawl
*
* @param requests
*/
public void addTargetRequests(List<String> requests) {
synchronized (targetRequests) {
@ -99,9 +88,9 @@ public class Page {
}
/**
* add url to crawl
*
* @param requestString
*/
public void addTargetRequest(String requestString) {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
@ -114,9 +103,9 @@ public class Page {
}
/**
* add a request to crawl
*
* @param request
*/
public void addTargetRequest(Request request) {
synchronized (targetRequests) {
@ -125,27 +114,22 @@ public class Page {
}
/**
* get url of current page
*
* @return url of current page
*/
public Selectable getUrl() {
return url;
}
/**
* set url of current page
*
* @param url
*/
public void setUrl(Selectable url) {
this.url = url;
}
/**
* get request of current page
*
* @return request
*/
public Request getRequest() {
return request;

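For orientation, the Page API documented above is what a PageProcessor implementation works with. A minimal sketch, assuming the Selectable API (regex/links/all) of this webmagic version; the class name, domain and expressions are illustrative, not part of this commit:

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

public class MyBlogProcessor implements PageProcessor {

    private final Site site = Site.me().setDomain("my.oschina.net").setSleepTime(1000);

    @Override
    public void process(Page page) {
        // save an extracted result; pipelines read it back through getResultItems()
        page.putField("title", page.getHtml().regex("<title>(.*?)</title>").toString());
        // add urls discovered on the current page to the crawl queue
        page.addTargetRequests(page.getHtml().links().regex(".*/blog/.*").all());
    }

    @Override
    public Site getSite() {
        return site;
    }
}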
@ -1,33 +1,17 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
* Object containing the url to crawl.<br>
* It can carry some additional information.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 11:37
* @since 0.1.0
*/
public class Request implements Serializable {
@ -36,20 +20,22 @@ public class Request implements Serializable {
private String url;
/**
* Store additional information in extras.
*/
private Map<String, Object> extras;
/**
* Priority of the request.<br>
* The bigger it is, the earlier the request is processed. <br>
* Needs a scheduler supporting priority.<br>
* But no scheduler in webmagic supports priority yet (:
*/
@Experimental
private double priority;
public Request() {
}
/**
* create a request with the url to crawl
*
* @param url url
*/
public Request(String url) {
this.url = url;
}
@ -59,12 +45,14 @@ public class Request implements Serializable {
}
/**
* Set the priority of the request for sorting.<br>
* Needs a scheduler supporting priority.<br>
* But no scheduler in webmagic supports priority yet (:
*
* @param priority
* @return this
*/
@Experimental
public Request setPriority(double priority) {
this.priority = priority;
return this;
@ -85,11 +73,6 @@ public class Request implements Serializable {
return this;
}
public String getUrl() {
return url;
}

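A short sketch of the extra information mentioned above: priority is @Experimental, and as the Javadoc itself notes, no bundled scheduler honors it yet. The url is illustrative:

// inside PageProcessor.process(Page page)
Request request = new Request("http://my.oschina.net/blog/123");
request.setPriority(100); // currently a no-op: no priority-aware Scheduler exists yet
page.addTargetRequest(request);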
@ -4,10 +4,13 @@ import java.util.HashMap;
import java.util.Map;
/**
* Object containing extracted results.<br>
* It is contained in Page and will be processed by Pipeline.
*
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 12:20 <br>
* @since 0.1.0
* @see Page
* @see us.codecraft.webmagic.pipeline.Pipeline
*/
public class ResultItems {
@ -25,7 +28,7 @@ public class ResultItems {
return (T) fields.get(key);
}
public Map<String, Object> getAll() {
return fields;
}
@ -44,8 +47,10 @@ public class ResultItems {
}
/**
* Whether to skip the result.<br>
* Result which is skipped will not be processed by Pipeline.
*
* @return whether to skip the result
*/
public boolean isSkip() {
return skip;
@ -53,8 +58,10 @@ public class ResultItems {
/**
* Set whether to skip the result.<br>
* Result which is skipped will not be processed by Pipeline.
*
* @param skip whether to skip the result
* @return this
*/
public ResultItems setSkip(boolean skip) {

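ResultItems is what every Pipeline receives; Spider calls pipeline.process(page.getResultItems(), this) for each page that is not skipped. A minimal sketch of a consuming pipeline (the class name is illustrative):

import java.util.Map;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class ConsolePipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // skipped results never reach a pipeline, so no isSkip() check is needed here
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            System.out.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
}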
@ -5,12 +5,11 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.*;
/**
* Object containing settings for the crawler.<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 12:13
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor
*/
public class Site {
@ -22,6 +21,9 @@ public class Site {
private String charset;
/**
* startUrls are the urls the crawler starts with.
*/
private List<String> startUrls = new ArrayList<String>();
private int sleepTime = 3000;
@ -37,19 +39,19 @@ public class Site {
}
/**
* create a new Site, equivalent to new Site()
*
* @return new site
*/
public static Site me() {
return new Site();
}
/**
* Add a cookie with domain {@link #getDomain()}
*
* @param name
* @param value
* @return this
*/
public Site addCookie(String name, String value) {
@ -58,7 +60,7 @@ public class Site {
}
/**
* set user agent
*
* @param userAgent userAgent
* @return this
@ -69,27 +71,27 @@ public class Site {
}
/**
* get cookies
*
* @return cookies
*/
public Map<String, String> getCookies() {
return cookies;
}
/**
* get user agent
*
* @return user agent
*/
public String getUserAgent() {
return userAgent;
}
/**
* get domain
*
* @return domain
*/
public String getDomain() {
if (domain == null) {
@ -101,10 +103,9 @@ public class Site {
}
/**
* set the domain of site.
*
* @param domain
* @return this
*/
public Site setDomain(String domain) {
@ -113,10 +114,10 @@ public class Site {
}
/**
* Set charset of page manually.<br>
* When charset is not set or set to null, it can be auto detected from the Http header.
*
* @param charset
* @return this
*/
public Site setCharset(String charset) {
@ -125,20 +126,21 @@ public class Site {
}
/**
* get charset set manually
*
* @return charset
*/
public String getCharset() {
return charset;
}
/**
* Set acceptStatCode.<br>
* When the status code of a http response is in acceptStatCodes, it will be processed.<br>
* {200} by default.<br>
* Usually it does not need to be set.<br>
*
* @param acceptStatCode
* @return this
*/
public Site setAcceptStatCode(Set<Integer> acceptStatCode) {
@ -147,27 +149,27 @@ public class Site {
}
/**
* get acceptStatCode
*
* @return acceptStatCode
*/
public Set<Integer> getAcceptStatCode() {
return acceptStatCode;
}
/**
* get start urls
*
* @return start urls
*/
public List<String> getStartUrls() {
return startUrls;
}
/**
* Add a url to start urls.<br>
*
* @param startUrl
* @return this
*/
public Site addStartUrl(String startUrl) {
@ -176,9 +178,10 @@ public class Site {
}
/**
* Set the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
*
* @param sleepTime
* @return this
*/
public Site setSleepTime(int sleepTime) {
@ -187,25 +190,26 @@ public class Site {
}
/**
* Get the interval between the processing of two pages.<br>
* Time unit is milliseconds.<br>
*
* @return the interval between the processing of two pages
*/
public int getSleepTime() {
return sleepTime;
}
/**
* Get retry times when download fails, 0 by default.<br>
*
* @return retry times when download fails
*/
public int getRetryTimes() {
return retryTimes;
}
/**
* Set retry times when download fails, 0 by default.<br>
*
* @return this
*/

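Since every Site setter above returns this, configuration reads as one fluent chain. A sketch with illustrative values:

import us.codecraft.webmagic.Site;

public class SiteConfigExample {

    public static void main(String[] args) {
        Site site = Site.me()
                .setDomain("my.oschina.net")
                .addStartUrl("http://my.oschina.net/")
                .setCharset("utf-8")         // omit to auto detect from the Http header
                .setSleepTime(1000)          // milliseconds between two pages
                .setRetryTimes(3)            // retry a failed download up to 3 times
                .addCookie("uid", "12345");  // cookie for the domain set above
        System.out.println(site.getDomain());
    }
}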
@ -18,25 +18,30 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br>
* Every module is a field of Spider. <br>
* The modules are defined as interfaces. <br>
* You can customize a spider with various implementations of them. <br>
* Examples: <br>
* <br>
* A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br>
* <br>
* Store results to files by FilePipeline: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* <br>
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 6:53
* @see Downloader
* @see Scheduler
* @see PageProcessor
* @see Pipeline
* @since 0.1.0
*/
public class Spider implements Runnable, Task {
@ -222,11 +227,12 @@ public class Spider implements Runnable, Task {
/**
* download and process given urls for test
*
* @param urls urls to process
*/
public void test(String... urls) {
checkComponent();
if (urls.length > 0) {
for (String url : urls) {
processRequest(new Request(url));
}
@ -241,7 +247,7 @@ public class Spider implements Runnable, Task {
}
pageProcessor.process(page);
addRequest(page);
if (!page.getResultItems().isSkip()) {
for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
@ -298,8 +304,8 @@ public class Spider implements Runnable, Task {
return this;
}
public Spider clearPipeline() {
pipelines = new ArrayList<Pipeline>();
return this;
}

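The examples in the Javadoc above combine into one chain when several modules are customized at once. A sketch, assuming FilePipeline and FileCacheQueueScheduler live in us.codecraft.webmagic.pipeline and us.codecraft.webmagic.scheduler respectively:

import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;

public class CustomizedSpiderExample {

    public static void main(String[] args) {
        Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
                .pipeline(new FilePipeline("/data/temp/webmagic/"))
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                .run();
    }
}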
@ -38,7 +38,7 @@ public class HttpClientDownloader implements Downloader {
* A convenient method to download a page directly.<br>
*
* @param url
* @return html
*/
public Html download(String url) {
Page page = download(new Request(url), null);

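The convenience method above allows a one-off download without assembling a whole Spider. A sketch, assuming HttpClientDownloader has a no-arg constructor and Html lives in us.codecraft.webmagic.selector:

import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.selector.Html;

public class DownloadOnceExample {

    public static void main(String[] args) {
        // equivalent to download(new Request(url), null) as shown above
        Html html = new HttpClientDownloader().download("http://my.oschina.net/");
        // Html is a Selectable, so the extraction API applies directly
        System.out.println(html.xpath("//title").toString());
    }
}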
@ -2,9 +2,6 @@
<body>
<div class="en">
Main class "Spider" and models.
</div>
</body>
</html>

@ -1,4 +1,4 @@
package us.codecraft.webmagic.utils;
/**
* @author code4crafter@gmail.com <br>

@ -1,6 +1,6 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.util.Collection;

@ -4,7 +4,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.processor.SimplePageProcessor;

@ -1,6 +1,6 @@
package us.codecraft.webmagic.model;
import us.codecraft.webmagic.utils.Experimental;
/**
* Interface to be implemented by page models.<br>

@ -21,7 +21,7 @@ public @interface ComboExtract {
*/
ExtractBy[] value();
public static enum Op {
/**
* All extractors will be arranged as a pipeline. <br>
* The next extractor uses the result of the previous as source.
@ -49,7 +49,10 @@ public @interface ComboExtract {
*/
boolean notNull() default false;
/**
* types of source for extracting.
*/
public static enum Source {
/**
* extract from the content extracted by class extractor
*/

@ -21,7 +21,10 @@ public @interface ExtractBy {
*/
String value();
/**
* types of extractor expressions
*/
public static enum Type {XPath, Regex, Css}
/**
* Extractor type, supporting XPath, CSS Selector and regex.
@ -38,7 +41,10 @@ public @interface ExtractBy {
*/
boolean notNull() default false;
/**
* types of source for extracting.
*/
public static enum Source {
/**
* extract from the content extracted by class extractor
*/

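These annotations drive webmagic's annotation-based page models: each field is filled from the page by its extractor. A sketch of a model class, assuming the annotation package us.codecraft.webmagic.model.annotation and that Type.XPath is the default type; field names and expressions are illustrative:

import us.codecraft.webmagic.model.annotation.ExtractBy;

public class OschinaBlog {

    @ExtractBy("//h1/text()") // bare string relies on the assumed XPath default
    private String title;

    @ExtractBy(value = "div.content", type = ExtractBy.Type.Css)
    private String content;
}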
@ -3,7 +3,7 @@ package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.MultiPageModel;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.DoubleKeyMap;
import java.util.*;

@ -16,7 +16,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Store urls and cursor in files so that a Spider can resume the status when shutdown.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.2.0
