Merge pull request #513 from xbynet/master

Request支持设置header与cookie、新增POST请求时,XML、JSON参数支持、Page支持获取响应header
pull/502/head^2
Yihua Huang 8 years ago committed by GitHub
commit 25df6650d9

@ -1,14 +1,16 @@
package us.codecraft.webmagic;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;
/**
* Object storing extracted result and urls to fetch.<br>
* Not thread safe.<br>
@ -43,6 +45,11 @@ public class Page {
private boolean needCycleRetry;
private List<Request> targetRequests = new ArrayList<Request>();
/**
* HTTP response headers received for this page.
*/
private Header[] headers=null;
public Page() {
}
@ -210,6 +217,14 @@ public class Page {
return this;
}
/**
 * Returns the headers stored on this page.
 *
 * @return the stored header array, or {@code null} when none has been set
 */
public Header[] getHeaders() {
    return this.headers;
}
/**
 * Stores the given headers on this page.
 *
 * @param headers the header array to keep; the array is stored as-is, not copied
 */
public void setHeaders(Header[] headers) {
    this.headers = headers;
}
@Override
public String toString() {
return "Page{" +
@ -219,6 +234,11 @@ public class Page {
", url=" + url +
", statusCode=" + statusCode +
", targetRequests=" + targetRequests +
", headers=" + headers+
'}';
}
}

@ -1,11 +1,21 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.utils.Experimental;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicHeader;
import us.codecraft.webmagic.utils.Experimental;
import us.codecraft.webmagic.utils.UrlUtils;
/**
* Object contains url to crawl.<br>
* It contains some additional information.<br>
@ -33,6 +43,18 @@ public class Request implements Serializable {
* POST/GET param set
* */
private Map<String,String> params=new HashMap<String, String>();
/**
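* Request body for POST requests (JSON, XML or any other entity). When it is set, it is sent as the POST body instead of the form built from params and the "nameValuePair" extra.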
*/
private HttpEntity entity;
/**
* cookies for current url, if not set use Site's cookies
*/
private List<Cookie> cookies=new ArrayList<Cookie>();
private List<Header> headers=new ArrayList<Header>();
/**
* Priority of the request.<br>
@ -145,12 +167,59 @@ public class Request implements Serializable {
if (method != null ? !method.equals(request.method) : request.method != null) return false;
return params != null ? params.equals(request.params) : request.params == null;
}
/**
 * Appends a header to the list of headers kept for this request.
 *
 * @param name  header name
 * @param value header value
 */
public void addHeader(String name,String value){
    headers.add(new BasicHeader(name, value));
}
/**
 * Returns the headers added to this request.
 *
 * @return the internal header list itself (not a copy)
 */
public List<Header> getHeaders(){
    return this.headers;
}
/**
 * Creates a cookie from the given name/value pair, scopes it to the domain
 * that {@code UrlUtils.getDomain} derives from this request's url, and adds
 * it to the cookies of this request.
 *
 * @param key   cookie name
 * @param value cookie value
 */
public void addCookie(String key,String value){
    BasicClientCookie cookie = new BasicClientCookie(key, value);
    String domain = UrlUtils.getDomain(url);
    cookie.setDomain(domain);
    cookies.add(cookie);
}
/**
 * Returns the cookies attached to this request.
 *
 * @return the internal cookie list itself (not a copy)
 */
public List<Cookie> getCookies() {
    return this.cookies;
}
/**
 * Replaces the cookie list of this request.
 *
 * @param cookies the list to use from now on; it is stored as-is, not copied
 */
public void setCookies(List<Cookie> cookies) {
    this.cookies = cookies;
}
/**
 * Stores the given JSON text as this request's entity.
 *
 * @param jsonStr  the JSON text to send
 * @param encoding charset name used to encode the text; {@code null} means UTF-8
 */
public void setJsonParam(String jsonStr,String encoding){
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity body = new StringEntity(jsonStr, charset);
    // Content-Encoding names a content coding such as gzip or deflate, not a
    // character set, so it is intentionally left unset here. The charset is
    // advertised through the Content-Type header instead.
    body.setContentType("application/json; charset=" + charset);
    entity = body;
}
/**
 * Stores the given XML text as this request's entity.
 *
 * @param xmlStr   the XML text to send
 * @param encoding charset name used to encode the text; {@code null} means UTF-8
 */
public void setXmlParam(String xmlStr,String encoding){
    String charset = encoding == null ? "UTF-8" : encoding;
    StringEntity body = new StringEntity(xmlStr, charset);
    // Content-Encoding names a content coding such as gzip or deflate, not a
    // character set, so it is intentionally left unset here. The charset is
    // advertised through the Content-Type header instead.
    body.setContentType("text/xml; charset=" + charset);
    entity = body;
}
/**
 * Returns the entity stored on this request.
 *
 * @return the stored entity, or {@code null} when none has been set
 */
public HttpEntity getEntity() {
    return this.entity;
}
/**
 * Stores an arbitrary entity on this request.
 *
 * @param entity the entity to keep; may be {@code null} to clear it
 */
public void setEntity(HttpEntity entity) {
    this.entity = entity;
}
/**
 * Computes the hash code from url, method and params only.
 * <p>
 * headers, entity and cookies are deliberately left out: {@code equals}
 * does not compare them, so including them would let two equal requests
 * produce different hash codes and break hash-based collections. An entity
 * that does not override {@code hashCode} would additionally make the value
 * differ between otherwise identical requests.
 *
 * @return hash code consistent with {@code equals}
 */
@Override
public int hashCode() {
    int result = url != null ? url.hashCode() : 0;
    result = 31 * result + (method != null ? method.hashCode() : 0);
    result = 31 * result + (params != null ? params.hashCode() : 0);
    return result;
}
@ -162,6 +231,10 @@ public class Request implements Serializable {
", extras=" + extras +
", params=" + params +
", priority=" + priority +
", headers=" + headers +
", entity=" + entity +
", cookies="+ cookies+
'}';
}
}

@ -1,21 +1,37 @@
package us.codecraft.webmagic.downloader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.annotation.ThreadSafe;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
/**
* The http downloader based on HttpClient.
@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader {
}
HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
HttpClientContext context=null;
if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){
context=new HttpClientContext();
CookieStore cookieStore=new BasicCookieStore();
for(Cookie c:request.getCookies()){
cookieStore.addCookie(c);
}
context.setCookieStore(cookieStore);
}
if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){
for(Header h:request.getHeaders()){
httpUriRequest.setHeader(h);
}
}
httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context);
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
Page page = handleResponse(request, charset, httpResponse, task);
page.setHeaders(httpResponse.getAllHeaders());
onSuccess(request);
return page;
} else {
@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader {
//default get
return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
if(request.getEntity()!=null){
return RequestBuilder.post().setEntity(request.getEntity());
}else{
return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
}
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {

@ -26,7 +26,7 @@ public abstract class CharsetUtils {
// charset
// 1、encoding in http header Content-Type
charset = UrlUtils.getCharset(contentType);
if (StringUtils.isNotBlank(contentType)) {
if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) {
logger.debug("Auto get charset: {}", charset);
return charset;
}

Loading…
Cancel
Save