diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 7c0064d1..f9495a4a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
/**
* Object storing extracted result and urls to fetch.
@@ -38,6 +39,8 @@ public class Page {
private Selectable url;
+    /**
+     * Response headers for this page: header name mapped to every value received
+     * for that name. Stays {@code null} until {@code setHeaders} is called.
+     */
+    private Map<String, List<String>> headers;
+
private int statusCode;
private boolean needCycleRetry;
@@ -210,6 +213,14 @@ public class Page {
return this;
}
+    /**
+     * Returns the headers stored on this page.
+     *
+     * @return header name mapped to the list of values for that name, or
+     *         {@code null} if no headers have been set
+     */
+    public Map<String, List<String>> getHeaders() {
+        return headers;
+    }
+
+    /**
+     * Stores the headers for this page. The map is kept by reference, not copied.
+     *
+     * @param headers header name mapped to the list of values for that name
+     */
+    public void setHeaders(Map<String, List<String>> headers) {
+        this.headers = headers;
+    }
+
@Override
public String toString() {
return "Page{" +
@@ -217,7 +228,9 @@ public class Page {
", resultItems=" + resultItems +
", rawText='" + rawText + '\'' +
", url=" + url +
+ ", headers=" + headers +
", statusCode=" + statusCode +
+ ", needCycleRetry=" + needCycleRetry +
", targetRequests=" + targetRequests +
'}';
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index 0a38fcc6..21cd72e6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -18,7 +18,6 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
- public static final String PROXY = "proxy";
private String url;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 5606d122..520902db 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -39,16 +39,6 @@ public class Site {
private boolean useGzip = true;
- /**
- * @see us.codecraft.webmagic.utils.HttpConstant.Header
- * @deprecated
- */
- public static interface HeaderConst {
-
- public static final String REFERER = "Referer";
- }
-
-
static {
DEFAULT_STATUS_CODE_SET.add(200);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 3a44af65..e6523ec8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
+import us.codecraft.webmagic.utils.HttpClientUtils;
import java.io.IOException;
import java.nio.charset.Charset;
@@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader {
private ProxyProvider proxyProvider;
+ private boolean responseHeader = true;
+
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
this.httpUriRequestConverter = httpUriRequestConverter;
}
@@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader {
HttpContext httpContext = new BasicHttpContext();
if (proxyProvider != null) {
proxy = proxyProvider.getProxy(task);
- request.putExtra(Request.PROXY, proxy);
AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
}
- HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
CloseableHttpClient httpClient = getHttpClient(site);
+ HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
try {
httpResponse = httpClient.execute(httpUriRequest, httpContext);
statusCode = httpResponse.getStatusLine().getStatusCode();
@@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
+ if (responseHeader) {
+ page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
+ }
return page;
}
- protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
+ private String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
@@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
- protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
+ /**
+ * Determines the charset of a response by handing the value of the entity's
+ * Content-Type header and the raw body bytes to CharsetUtils.detectCharset.
+ *
+ * NOTE(review): HttpEntity.getContentType() is documented to return null when
+ * the content type is unknown; that case is not guarded here and would throw
+ * a NullPointerException - confirm whether it needs handling.
+ * NOTE(review): this change narrows the method from protected to private, so
+ * subclasses can no longer override it - confirm that is intended.
+ *
+ * @param httpResponse response whose entity supplies the Content-Type header
+ * @param contentBytes raw body bytes passed through to the detector
+ * @return whatever CharsetUtils.detectCharset returns for these inputs
+ * @throws IOException declared by the signature
+ */
+ private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
index 951d3323..db131d07 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java
@@ -39,7 +39,7 @@ public class HttpUriRequestConverter {
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
- .setCookieSpec(CookieSpecs.BEST_MATCH);
+ .setCookieSpec(CookieSpecs.STANDARD);
}
if (proxy != null) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
similarity index 86%
rename from webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java
rename to webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
index db17de2b..3e68c116 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/BannedChecker.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ResponseChecker.java
@@ -7,7 +7,7 @@ import org.apache.http.HttpResponse;
* Date: 17/3/20
* Time: 下午10:52
*/
-public interface BannedChecker {
+public interface ResponseChecker {
+ /**
+ * Inspects an HTTP response and reports a yes/no verdict on it.
+ *
+ * NOTE(review): presumably "banned" means the target site rejected the
+ * proxy used for the request - confirm against the implementations.
+ * NOTE(review): the interface is renamed to ResponseChecker while this
+ * method keeps the name isBanned - confirm the naming is intended.
+ *
+ * @param httpResponse the response to inspect
+ * @return true if the response is considered banned
+ */
boolean isBanned(HttpResponse httpResponse);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java
new file mode 100644
index 00000000..93f8fe96
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpClientUtils.java
@@ -0,0 +1,40 @@
+package us.codecraft.webmagic.utils;
+
+import org.apache.http.Header;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Static helpers for converting Apache HttpClient types into plain collections.
+ *
+ * @author code4crafter@gmail.com
+ * Date: 17/3/27
+ */
+public abstract class HttpClientUtils {
+
+    /**
+     * Groups an array of headers by header name.
+     * <p>
+     * Headers sharing a name are collected into one list, in array order. Names
+     * are used as map keys exactly as returned by {@code Header.getName()}.
+     *
+     * @param headers headers to group; must not be {@code null}
+     * @return a new {@link HashMap} from header name to the values found for that name
+     */
+    public static Map<String, List<String>> convertHeaders(Header[] headers) {
+        Map<String, List<String>> results = new HashMap<String, List<String>>();
+        for (Header header : headers) {
+            String name = header.getName();
+            List<String> values = results.get(name);
+            if (values == null) {
+                values = new ArrayList<String>();
+                results.put(name, values);
+            }
+            values.add(header.getValue());
+        }
+        return results;
+    }
+}