add retry

pull/23/merge
yihua.huang 12 years ago
parent a1ef2523cc
commit d141541ef3

@ -17,6 +17,8 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
private String url;
/**

@ -30,6 +30,8 @@ public class Site {
private int retryTimes = 0;
private int cycleRetryTimes = 0;
private static final Set<Integer> DEFAULT_STATUS_CODE_SET = new HashSet<Integer>();
private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;
@ -200,7 +202,7 @@ public class Site {
}
/**
* Get retry times when download fail, 0 by default.<br>
* Get the number of immediate retries when a download fails, 0 by default.<br>
*
* @return retry times when download fail
*/
@ -218,6 +220,25 @@ public class Site {
return this;
}
/**
* When cycleRetryTimes is greater than 0, a request that fails to download is added back to the scheduler and tried again. <br>
*
* @return cycle retry times when download fails
*/
public int getCycleRetryTimes() {
    // Plain accessor: report the configured value unchanged.
    return this.cycleRetryTimes;
}
/**
* Set the cycle retry times used when a download fails, 0 by default. Only works with RedisScheduler. <br>
*
* @param cycleRetryTimes cycle retry times
* @return this
*/
public Site setCycleRetryTimes(int cycleRetryTimes) {
// Store the value as given; no range check is applied here.
this.cycleRetryTimes = cycleRetryTimes;
// Return this Site so that setter calls can be chained.
return this;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader {
* @param url
* @return html
*/
public Html download(String url,String charset) {
public Html download(String url, String charset) {
Page page = download(new Request(url), Site.me().setCharset(charset).toTask());
return (Html) page.getHtml();
}
@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader {
if (tried > retryTimes) {
    logger.warn("download page " + request.getUrl() + " error", e);
    // Immediate retries are exhausted; give up unless cycle retry is enabled for this site.
    if (site.getCycleRetryTimes() <= 0) {
        return null;
    }
    // The CYCLE_TRIED_TIMES extra carries the number of cycle retries already recorded
    // on this request; it is absent the first time the request fails.
    Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
    int cycleTriedTimes = 1;
    if (cycleTriedTimesObject != null) {
        cycleTriedTimes = ((Integer) cycleTriedTimesObject) + 1;
        if (cycleTriedTimes >= site.getCycleRetryTimes()) {
            // The cycle retry limit has been reached: drop the request.
            return null;
        }
    }
    // Hand the failed request back as a target request of an otherwise empty page.
    // The incremented counter (not a constant) must be stored, otherwise the limit
    // check above can never trigger and the request would be retried forever.
    Page page = new Page();
    page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
    return page;
}
logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!");

@ -1,5 +1,6 @@
package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.utils.EnvironmentUtil;
@ -15,6 +16,8 @@ import java.util.List;
*/
public class Html extends PlainText {
private Logger logger = Logger.getLogger(getClass());
/**
* Store parsed document for better performance when only one text exists.
*/
@ -26,7 +29,11 @@ public class Html extends PlainText {
public Html(String text) {
super(text);
this.document = Jsoup.parse(text);
try {
this.document = Jsoup.parse(text);
} catch (Exception e) {
logger.warn("parse document error ", e);
}
}
public Html(Document document) {
@ -108,7 +115,7 @@ public class Html extends PlainText {
}
public String getText() {
if (strings!=null&&strings.size()>0){
if (strings != null && strings.size() > 0) {
return strings.get(0);
}
return document.html();

@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler {
public synchronized void push(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
//使用Set进行url去重
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
//使用List保存队列
// if cycleRetriedTimes is set, allow the duplicate url to be queued again.
Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES);
// use set to remove duplicate url
if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
// use list to store queue
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
if (request.getExtras() != null) {

Loading…
Cancel
Save