add response headers to Page #508

pull/524/head
yihua.huang 8 years ago
parent ba000b364c
commit f23e138c72

@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Object storing extracted result and urls to fetch.<br>
@ -38,6 +39,8 @@ public class Page {
private Selectable url;
private Map<String,List<String>> headers;
private int statusCode;
private boolean needCycleRetry;
@ -210,6 +213,14 @@ public class Page {
return this;
}
/**
 * Returns the response headers stored on this page.
 *
 * @return the map last passed to {@link #setHeaders(Map)}, returned by reference
 *         (no copy is made); presumably {@code null} when no headers were ever set
 */
public Map<String, List<String>> getHeaders() {
    return this.headers;
}
/**
 * Stores the response headers on this page.
 *
 * @param headers header values keyed by header name; the map is kept by reference, not copied
 */
public void setHeaders(Map<String, List<String>> headers) {
this.headers = headers;
}
@Override
public String toString() {
return "Page{" +
@ -217,7 +228,9 @@ public class Page {
", resultItems=" + resultItems +
", rawText='" + rawText + '\'' +
", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode +
", needCycleRetry=" + needCycleRetry +
", targetRequests=" + targetRequests +
'}';
}

@ -18,7 +18,6 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
public static final String PROXY = "proxy";
private String url;

@ -39,16 +39,6 @@ public class Site {
private boolean useGzip = true;
/**
 * Legacy holder for HTTP header name constants.
 *
 * @see us.codecraft.webmagic.utils.HttpConstant.Header
 * @deprecated see {@code us.codecraft.webmagic.utils.HttpConstant.Header}
 */
public interface HeaderConst {
    /** Name of the HTTP {@code Referer} request header. */
    String REFERER = "Referer";
}
static {
DEFAULT_STATUS_CODE_SET.add(200);
}

@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import java.io.IOException;
import java.nio.charset.Charset;
@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader {
private ProxyProvider proxyProvider;
private boolean responseHeader = true;
/**
 * Replaces the converter this downloader uses to turn a request, site and proxy
 * into the outgoing {@code HttpUriRequest}.
 *
 * @param httpUriRequestConverter the converter to use for subsequent downloads
 */
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
this.httpUriRequestConverter = httpUriRequestConverter;
}
@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader {
HttpContext httpContext = new BasicHttpContext();
if (proxyProvider != null) {
proxy = proxyProvider.getProxy(task);
request.putExtra(Request.PROXY, proxy);
AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
}
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
CloseableHttpClient httpClient = getHttpClient(site);
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
try {
httpResponse = httpClient.execute(httpUriRequest, httpContext);
statusCode = httpResponse.getStatusLine().getStatusCode();
@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
if (responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
}
return page;
}
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
private String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
}
}

@ -39,7 +39,7 @@ public class HttpUriRequestConverter {
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH);
.setCookieSpec(CookieSpecs.STANDARD);
}
if (proxy != null) {

@ -7,7 +7,7 @@ import org.apache.http.HttpResponse;
* Date: 17/3/20
* Time: 10:52
*/
public interface BannedChecker {
public interface ResponseChecker {
boolean isBanned(HttpResponse httpResponse);
}

@ -0,0 +1,28 @@
package us.codecraft.webmagic.utils;
import org.apache.http.Header;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Static helpers for converting Apache HttpClient types into plain collections.
 *
 * @author code4crafter@gmail.com
 * Date: 17/3/27
 */
public abstract class HttpClientUtils {

    /**
     * Groups an array of HTTP headers by header name.
     * <p>
     * Headers that share a name are collected into one list, in the order they
     * appear in the array. Names are used as map keys exactly as received, so
     * lookups are case-sensitive.
     *
     * @param headers the headers to convert; {@code null} is treated as an empty array,
     *                and {@code null} elements are skipped
     * @return a new mutable map from header name to that header's values; never {@code null}
     */
    public static Map<String,List<String>> convertHeaders(Header[] headers){
        Map<String,List<String>> results = new HashMap<String, List<String>>();
        if (headers == null) {
            // Nothing to convert: hand back an empty map rather than failing with an NPE.
            return results;
        }
        for (Header header : headers) {
            if (header == null) {
                continue;
            }
            String name = header.getName();
            List<String> values = results.get(name);
            if (values == null) {
                values = new ArrayList<String>();
                results.put(name, values);
            }
            values.add(header.getValue());
        }
        return results;
    }
}
Loading…
Cancel
Save