From d141541ef30bc6a9b12a9432bd9a5795008f3d10 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 4 Sep 2013 09:57:19 +0800 Subject: [PATCH] add retry --- .../java/us/codecraft/webmagic/Request.java | 2 ++ .../main/java/us/codecraft/webmagic/Site.java | 23 ++++++++++++++++++- .../downloader/HttpClientDownloader.java | 17 +++++++++++++- .../us/codecraft/webmagic/selector/Html.java | 11 +++++++-- .../webmagic/scheduler/RedisScheduler.java | 8 ++++--- 5 files changed, 54 insertions(+), 7 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 694d32b2..142a20c7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -17,6 +17,8 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; + public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; + private String url; /** diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 443f2bba..6a351786 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -30,6 +30,8 @@ public class Site { private int retryTimes = 0; + private int cycleRetryTimes = 0; + private static final Set DEFAULT_STATUS_CODE_SET = new HashSet(); private Set acceptStatCode = DEFAULT_STATUS_CODE_SET; @@ -200,7 +202,7 @@ public class Site { } /** - * Get retry times when download fail, 0 by default.
+ * Get the number of times a failed download is retried immediately, 0 by default.
* * @return retry times when download fail */ @@ -218,6 +220,25 @@ public class Site { return this; } + /** + * Get the cycle retry times. When greater than 0, a request whose download failed is added back to the scheduler and downloaded again.
+ * + * @return cycle retry times, 0 by default + */ + public int getCycleRetryTimes() { + return cycleRetryTimes; + } + + /** + * Set how many times a failed download is added back to the scheduler for another try, 0 by default. Only works with RedisScheduler.
+ * + * @return this + */ + public Site setCycleRetryTimes(int cycleRetryTimes) { + this.cycleRetryTimes = cycleRetryTimes; + return this; + } + @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 7a063298..82a4a9a7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -52,7 +52,7 @@ public class HttpClientDownloader implements Downloader { * @param url * @return html */ - public Html download(String url,String charset) { + public Html download(String url, String charset) { Page page = download(new Request(url), Site.me().setCharset(charset).toTask()); return (Html) page.getHtml(); } @@ -90,6 +90,21 @@ public class HttpClientDownloader implements Downloader { if (tried > retryTimes) { logger.warn("download page " + request.getUrl() + " error", e); + if (site.getCycleRetryTimes() > 0) { + Page page = new Page(); + Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES); + if (cycleTriedTimesObject == null) { + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1)); + } else { + int cycleTriedTimes = (Integer) cycleTriedTimesObject; + cycleTriedTimes++; + if (cycleTriedTimes >= site.getCycleRetryTimes()) { + return null; + } + page.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes)); + } + return page; + } return null; } logger.info("download page " + request.getUrl() + " error, retry the " + tried + " time!"); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 17988249..b9b7f02b 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.selector; +import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import us.codecraft.webmagic.utils.EnvironmentUtil; @@ -15,6 +16,8 @@ import java.util.List; */ public class Html extends PlainText { + private Logger logger = Logger.getLogger(getClass()); + /** * Store parsed document for better performance when only one text exist. */ @@ -26,7 +29,11 @@ public class Html extends PlainText { public Html(String text) { super(text); - this.document = Jsoup.parse(text); + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + logger.warn("parse document error ", e); + } } public Html(Document document) { @@ -108,7 +115,7 @@ public class Html extends PlainText { } public String getText() { - if (strings!=null&&strings.size()>0){ + if (strings != null && strings.size() > 0) { return strings.get(0); } return document.html(); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java index e1916279..cd906255 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java @@ -36,9 +36,11 @@ public class RedisScheduler implements Scheduler { public synchronized void push(Request request, Task task) { Jedis jedis = pool.getResource(); try { - //使用Set进行url去重 - if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { - //使用List保存队列 + // if cycleRetriedTimes is set, allow duplicated. 
+ Object cycleRetriedTimes = request.getExtra(Request.CYCLE_TRIED_TIMES); + // use set to remove duplicate url + if (cycleRetriedTimes != null || !jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) { + // use list to store queue jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl()); jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl()); if (request.getExtras() != null) {