add response headers to Page #508

pull/524/head
yihua.huang 8 years ago
parent ba000b364c
commit f23e138c72

@ -8,6 +8,7 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* Object storing extracted result and urls to fetch.<br> * Object storing extracted result and urls to fetch.<br>
@ -38,6 +39,8 @@ public class Page {
private Selectable url; private Selectable url;
private Map<String,List<String>> headers;
private int statusCode; private int statusCode;
private boolean needCycleRetry; private boolean needCycleRetry;
@ -210,6 +213,14 @@ public class Page {
return this; return this;
} }
/**
 * Returns the headers stored on this page, keyed by header name with
 * every value recorded for that name.
 * <p>
 * The internal map is returned directly (not a copy). The backing field
 * has no initializer, so this may be {@code null} when
 * {@link #setHeaders(Map)} was never called.
 *
 * @return the header map held by this page, possibly {@code null}
 */
public Map<String, List<String>> getHeaders() {
return headers;
}
/**
 * Replaces the headers stored on this page.
 * <p>
 * The given map is stored by reference; no copy or validation is done.
 *
 * @param headers header names mapped to their values
 */
public void setHeaders(Map<String, List<String>> headers) {
this.headers = headers;
}
@Override @Override
public String toString() { public String toString() {
return "Page{" + return "Page{" +
@ -217,7 +228,9 @@ public class Page {
", resultItems=" + resultItems + ", resultItems=" + resultItems +
", rawText='" + rawText + '\'' + ", rawText='" + rawText + '\'' +
", url=" + url + ", url=" + url +
", headers=" + headers +
", statusCode=" + statusCode + ", statusCode=" + statusCode +
", needCycleRetry=" + needCycleRetry +
", targetRequests=" + targetRequests + ", targetRequests=" + targetRequests +
'}'; '}';
} }

@ -18,7 +18,6 @@ public class Request implements Serializable {
private static final long serialVersionUID = 2062192774891352043L; private static final long serialVersionUID = 2062192774891352043L;
public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times"; public static final String CYCLE_TRIED_TIMES = "_cycle_tried_times";
public static final String PROXY = "proxy";
private String url; private String url;

@ -39,16 +39,6 @@ public class Site {
private boolean useGzip = true; private boolean useGzip = true;
/**
 * Holder for HTTP header-name constants; currently only {@code Referer}.
 *
 * @see us.codecraft.webmagic.utils.HttpConstant.Header
 * @deprecated presumably superseded by
 * {@link us.codecraft.webmagic.utils.HttpConstant.Header} (the type
 * referenced above) -- TODO confirm it is the intended replacement.
 */
public static interface HeaderConst {
public static final String REFERER = "Referer";
}
static { static {
DEFAULT_STATUS_CODE_SET.add(200); DEFAULT_STATUS_CODE_SET.add(200);
} }

@ -23,6 +23,7 @@ import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider; import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -49,6 +50,8 @@ public class HttpClientDownloader extends AbstractDownloader {
private ProxyProvider proxyProvider; private ProxyProvider proxyProvider;
private boolean responseHeader = true;
public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) { public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
this.httpUriRequestConverter = httpUriRequestConverter; this.httpUriRequestConverter = httpUriRequestConverter;
} }
@ -88,13 +91,12 @@ public class HttpClientDownloader extends AbstractDownloader {
HttpContext httpContext = new BasicHttpContext(); HttpContext httpContext = new BasicHttpContext();
if (proxyProvider != null) { if (proxyProvider != null) {
proxy = proxyProvider.getProxy(task); proxy = proxyProvider.getProxy(task);
request.putExtra(Request.PROXY, proxy);
AuthState authState = new AuthState(); AuthState authState = new AuthState();
authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword())); authState.update(new BasicScheme(), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState); httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
} }
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
CloseableHttpClient httpClient = getHttpClient(site); CloseableHttpClient httpClient = getHttpClient(site);
HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, proxy);
try { try {
httpResponse = httpClient.execute(httpUriRequest, httpContext); httpResponse = httpClient.execute(httpUriRequest, httpContext);
statusCode = httpResponse.getStatusLine().getStatusCode(); statusCode = httpResponse.getStatusLine().getStatusCode();
@ -133,10 +135,13 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
if (responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
}
return page; return page;
} }
protected String getContent(String charset, HttpResponse httpResponse) throws IOException { private String getContent(String charset, HttpResponse httpResponse) throws IOException {
if (charset == null) { if (charset == null) {
byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent()); byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
String htmlCharset = getHtmlCharset(httpResponse, contentBytes); String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
@ -151,7 +156,7 @@ public class HttpClientDownloader extends AbstractDownloader {
} }
} }
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException { private String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes); return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
} }
} }

@ -39,7 +39,7 @@ public class HttpUriRequestConverter {
requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut()) requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
.setSocketTimeout(site.getTimeOut()) .setSocketTimeout(site.getTimeOut())
.setConnectTimeout(site.getTimeOut()) .setConnectTimeout(site.getTimeOut())
.setCookieSpec(CookieSpecs.BEST_MATCH); .setCookieSpec(CookieSpecs.STANDARD);
} }
if (proxy != null) { if (proxy != null) {

@ -7,7 +7,7 @@ import org.apache.http.HttpResponse;
* Date: 17/3/20 * Date: 17/3/20
* Time: 10:52 * Time: 10:52
*/ */
public interface BannedChecker { public interface ResponseChecker {
boolean isBanned(HttpResponse httpResponse); boolean isBanned(HttpResponse httpResponse);
} }

@ -0,0 +1,28 @@
package us.codecraft.webmagic.utils;
import org.apache.http.Header;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
 * Static helpers for working with Apache HttpClient types.
 *
 * @author code4crafter@gmail.com
 * Date: 17/3/27
 */
public abstract class HttpClientUtils {

    /**
     * Groups an array of HTTP headers into a map from header name to the
     * list of values seen for that name.
     * <p>
     * HTTP header field names are case-insensitive, so the returned map
     * compares keys ignoring case: {@code get("content-type")} finds a
     * header sent as {@code Content-Type}, and repeated headers that differ
     * only in case are collected under one entry. The key keeps the spelling
     * of the first occurrence. Values within each list keep the order in
     * which they appear in {@code headers}.
     *
     * @param headers the headers to group; {@code null} is treated as empty
     * @return a new mutable map of header name to values, never {@code null}
     */
    public static Map<String,List<String>> convertHeaders(Header[] headers){
        // Case-insensitive ordering so lookups do not depend on how the
        // server happened to capitalise the header name.
        Map<String,List<String>> results =
                new TreeMap<String, List<String>>(String.CASE_INSENSITIVE_ORDER);
        if (headers == null) {
            return results;
        }
        for (Header header : headers) {
            String name = header.getName();
            List<String> values = results.get(name);
            if (values == null) {
                values = new ArrayList<String>();
                results.put(name, values);
            }
            values.add(header.getValue());
        }
        return results;
    }
}
Loading…
Cancel
Save