From ec446277b139411112dc065281c5bb0417e06c32 Mon Sep 17 00:00:00 2001
From: "yihua.huang"
Date: Tue, 15 Apr 2014 15:30:37 +0800
Subject: [PATCH] some refactor in httpclientdownloader

---
Reviewer notes (this section is commentary and is not part of the diff):
 * getHttpUriRequest() now reads timeouts and proxy from site only when site
   is non-null. Before, site.getTimeOut() was called ahead of the
   "site != null" test, so a null site raised a NullPointerException.
 * Javadoc added to the new protected methods statusAccept() and
   getHttpUriRequest().
 * Generic type arguments written out (Set<Integer>, Map<String, String>,
   Map.Entry<String, String>); addHeader(String, String) does not compile
   against a raw Map.Entry.
 * NOTE(review): the From: header carries no e-mail address; confirm and
   restore it before applying with git am.
 * NOTE(review): line breaks and indentation were reconstructed (4-space
   convention assumed); verify against the pre-image. The post-image blob id
   on the first index line predates the amendments above.

 .../downloader/HttpClientDownloader.java      | 75 ++++++++++++++-----
 .../webmagic/model/samples/GithubRepo.java    |  7 +--
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 30c561b5..f0f53c6c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -7,6 +7,7 @@ import org.apache.http.annotation.ThreadSafe;
 import org.apache.http.client.config.CookieSpecs;
 import org.apache.http.client.config.RequestConfig;
 import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpUriRequest;
 import org.apache.http.client.methods.RequestBuilder;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.util.EntityUtils;
@@ -75,26 +76,12 @@ public class HttpClientDownloader extends AbstractDownloader {
             acceptStatCode = Sets.newHashSet(200);
         }
         logger.info("downloading page {}" , request.getUrl());
-        RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
-        if (headers != null) {
-            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
-                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
-            }
-        }
-        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
-                .setConnectionRequestTimeout(site.getTimeOut())
-                .setSocketTimeout(site.getTimeOut())
-                .setConnectTimeout(site.getTimeOut())
-                .setCookieSpec(CookieSpecs.BEST_MATCH);
-        if (site != null && site.getHttpProxy() != null) {
-            requestConfigBuilder.setProxy(site.getHttpProxy());
-        }
-        requestBuilder.setConfig(requestConfigBuilder.build());
         CloseableHttpResponse httpResponse = null;
         try {
-            httpResponse = getHttpClient(site).execute(requestBuilder.build());
+            HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
+            httpResponse = getHttpClient(site).execute(httpUriRequest);
             int statusCode = httpResponse.getStatusLine().getStatusCode();
-            if (acceptStatCode.contains(statusCode)) {
+            if (statusAccept(acceptStatCode, statusCode)) {
                 //charset
                 if (charset == null) {
                     String value = httpResponse.getEntity().getContentType().getValue();
@@ -123,6 +110,55 @@ public class HttpClientDownloader extends AbstractDownloader {
         }
     }
 
+    @Override
+    public void setThread(int thread) {
+        httpClientGenerator.setPoolSize(thread);
+    }
+
+    /**
+     * Tells whether a response status code is acceptable for processing.
+     * Protected, so subclasses may override the acceptance rule.
+     *
+     * @param acceptStatCode status codes accepted for this download
+     * @param statusCode status code of the received response
+     * @return true if {@code statusCode} is contained in {@code acceptStatCode}
+     */
+    protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
+        return acceptStatCode.contains(statusCode);
+    }
+
+    /**
+     * Builds the GET request for {@code request.getUrl()}, adding the given
+     * headers plus the timeout and proxy settings read from {@code site}.
+     *
+     * @param request request whose url is fetched
+     * @param site site configuration; may be null, in which case no timeouts
+     *             or proxy are configured on the request
+     * @param headers extra request headers; may be null
+     * @return the request to execute
+     */
+    protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
+        RequestBuilder requestBuilder = RequestBuilder.get().setUri(request.getUrl());
+        if (headers != null) {
+            for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
+                requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
+            }
+        }
+        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
+                .setCookieSpec(CookieSpecs.BEST_MATCH);
+        // site may be null (the original proxy check allowed for it), so read its settings only under a guard
+        if (site != null) {
+            requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
+                    .setSocketTimeout(site.getTimeOut())
+                    .setConnectTimeout(site.getTimeOut());
+            if (site.getHttpProxy() != null) {
+                requestConfigBuilder.setProxy(site.getHttpProxy());
+            }
+        }
+        requestBuilder.setConfig(requestConfigBuilder.build());
+        return requestBuilder.build();
+    }
+
     protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
         String content = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
         Page page = new Page();
@@ -132,9 +168,4 @@ public class HttpClientDownloader extends AbstractDownloader {
         page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
         return page;
     }
-
-    @Override
-    public void setThread(int thread) {
-        httpClientGenerator.setPoolSize(thread);
-    }
 }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
index 57de3f16..e8998eca 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/GithubRepo.java
@@ -3,9 +3,11 @@ package us.codecraft.webmagic.model.samples;
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.model.HasKey;
 import us.codecraft.webmagic.model.OOSpider;
-import us.codecraft.webmagic.model.annotation.*;
+import us.codecraft.webmagic.model.annotation.ExtractBy;
+import us.codecraft.webmagic.model.annotation.ExtractByUrl;
+import us.codecraft.webmagic.model.annotation.HelpUrl;
+import us.codecraft.webmagic.model.annotation.TargetUrl;
 import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
-import us.codecraft.webmagic.samples.formatter.StringTemplateFormatter;
 import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
 
 import java.util.List;
@@ -20,7 +22,6 @@ public class GithubRepo implements HasKey {
     @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
     private String name;
 
-    @Formatter(value = "author%s",formatter = StringTemplateFormatter.class)
     @ExtractByUrl("https://github\\.com/(\\w+)/.*")
     private String author;
 