From f23e138c728f95622170990e732be9f77c6b1f17 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 27 Mar 2017 09:52:25 +0800 Subject: [PATCH] add response headers to Page #508 --- .../main/java/us/codecraft/webmagic/Page.java | 13 +++++++++ .../java/us/codecraft/webmagic/Request.java | 1 - .../main/java/us/codecraft/webmagic/Site.java | 10 ------- .../downloader/HttpClientDownloader.java | 13 ++++++--- .../downloader/HttpUriRequestConverter.java | 2 +- ...annedChecker.java => ResponseChecker.java} | 2 +- .../webmagic/utils/HttpClientUtils.java | 28 +++++++++++++++++++ 7 files changed, 52 insertions(+), 17 deletions(-) rename webmagic-core/src/main/java/us/codecraft/webmagic/proxy/{BannedChecker.java => ResponseChecker.java} (86%) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7c0064d1..f9495a4a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.util.ArrayList; import java.util.List; +import java.util.Map; /** * Object storing extracted result and urls to fetch.
@@ -38,6 +39,8 @@ public class Page { private Selectable url; + private Map<String, List<String>> headers; + private int statusCode; private boolean needCycleRetry; @@ -210,6 +213,14 @@ public class Page { return this; } + public Map<String, List<String>> getHeaders() { + return headers; + } + + public void setHeaders(Map<String, List<String>> headers) { + this.headers = headers; + } + @Override public String toString() { return "Page{" + @@ -217,7 +228,9 @@ public class Page { ", resultItems=" + resultItems + ", rawText='" + rawText + '\'' + ", url=" + url + + ", headers=" + headers + ", statusCode=" + statusCode + + ", needCycleRetry=" + needCycleRetry + ", targetRequests=" + targetRequests + '}'; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 0a38fcc6..21cd72e6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -18,7 +18,6 @@ public class Request implements Serializable { private static final long serialVersionUID = 2062192774891352043L; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; - public static final String PROXY = "proxy"; private String url; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 5606d122..520902db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -39,16 +39,6 @@ public class Site { private boolean useGzip = true; - /** - * @see us.codecraft.webmagic.utils.HttpConstant.Header - * @deprecated - */ - public static interface HeaderConst { - - public static final String REFERER = "Referer"; - } - - static { DEFAULT_STATUS_CODE_SET.add(200); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 3a44af65..e6523ec8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.CharsetUtils; +import us.codecraft.webmagic.utils.HttpClientUtils; import java.io.IOException; import java.nio.charset.Charset; @@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader { private ProxyProvider proxyProvider; + private boolean responseHeader = true; + public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { this.httpUriRequestConverter = httpUriRequestConverter; } @@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader { HttpContext httpContext = new BasicHttpContext(); if (proxyProvider != null) { proxy = proxyProvider.getProxy(task); - request.putExtra(Request.PROXY, proxy); AuthState authState = new AuthState(); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); } - HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); CloseableHttpClient httpClient = getHttpClient(site); + HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy); try { httpResponse = httpClient.execute(httpUriRequest, httpContext); statusCode = httpResponse.getStatusLine().getStatusCode(); @@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader { page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); + if 
(responseHeader) { + page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); + } return page; } - protected String getContent(String charset, HttpResponse httpResponse) throws IOException { + private String getContent(String charset, HttpResponse httpResponse) throws IOException { if (charset == null) { byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); String htmlCharset = getHtmlCharset(httpResponse, contentBytes); @@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader { } } - protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { + private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index 951d3323..db131d07 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -39,7 +39,7 @@ public class HttpUriRequestConverter { requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut()) - .setCookieSpec(CookieSpecs.BEST_MATCH); + .setCookieSpec(CookieSpecs.STANDARD); } if (proxy != null) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java similarity index 86% rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java index db17de2b..3e68c116 100644 --- 
a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java @@ -7,7 +7,7 @@ import org.apache.http.HttpResponse; * Date: 17/3/20 * Time: 下午10:52 */ -public interface BannedChecker { +public interface ResponseChecker { boolean isBanned(HttpResponse httpResponse); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java new file mode 100644 index 00000000..93f8fe96 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java @@ -0,0 +1,28 @@ +package us.codecraft.webmagic.utils; + +import org.apache.http.Header; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author code4crafter@gmail.com + * Date: 17/3/27 + */ +public abstract class HttpClientUtils { + + public static Map<String, List<String>> convertHeaders(Header[] headers){ + Map<String, List<String>> results = new HashMap<String, List<String>>(); + for (Header header : headers) { + List<String> list = results.get(header.getName()); + if (list == null) { + list = new ArrayList<String>(); + results.put(header.getName(), list); + } + list.add(header.getValue()); + } + return results; + } +}