From 1c24baa8d12e27e311527a09c569f9b425d8b5f0 Mon Sep 17 00:00:00 2001 From: "xbynet@outlook.com" Date: Wed, 29 Mar 2017 02:27:06 +0800 Subject: [PATCH 1/2] =?UTF-8?q?Request=E6=94=AF=E6=8C=81=E8=AE=BE=E7=BD=AE?= =?UTF-8?q?header=E4=B8=8Ecookie=20=E6=96=B0=E5=A2=9EPOST=E8=AF=B7?= =?UTF-8?q?=E6=B1=82=E6=97=B6,XML=E3=80=81JSON=E5=8F=82=E6=95=B0=E6=94=AF?= =?UTF-8?q?=E6=8C=81=20Page=E6=94=AF=E6=8C=81=E8=8E=B7=E5=8F=96=E5=93=8D?= =?UTF-8?q?=E5=BA=94header?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/us/codecraft/webmagic/Page.java | 26 ++++++- .../java/us/codecraft/webmagic/Request.java | 77 ++++++++++++++++++- .../downloader/HttpClientDownloader.java | 43 +++++++++-- 3 files changed, 135 insertions(+), 11 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index 7c0064d1..1a6527dd 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -1,14 +1,16 @@ package us.codecraft.webmagic; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.lang3.StringUtils; +import org.apache.http.Header; + import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Json; import us.codecraft.webmagic.selector.Selectable; import us.codecraft.webmagic.utils.UrlUtils; -import java.util.ArrayList; -import java.util.List; - /** * Object storing extracted result and urls to fetch.
* Not thread safe.
@@ -43,6 +45,11 @@ public class Page { private boolean needCycleRetry; private List targetRequests = new ArrayList(); + + /** + * Http响应头 + */ + private Header[] headers=null; public Page() { } @@ -210,6 +217,14 @@ public class Page { return this; } + public Header[] getHeaders() { + return headers; + } + + public void setHeaders(Header[] headers) { + this.headers = headers; + } + @Override public String toString() { return "Page{" + @@ -219,6 +234,11 @@ public class Page { ", url=" + url + ", statusCode=" + statusCode + ", targetRequests=" + targetRequests + + ", headers=" + headers+ '}'; } + + + + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index c8c59782..d44f61f2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -1,11 +1,21 @@ package us.codecraft.webmagic; -import us.codecraft.webmagic.utils.Experimental; - import java.io.Serializable; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; +import org.apache.http.Header; +import org.apache.http.HttpEntity; +import org.apache.http.cookie.Cookie; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.cookie.BasicClientCookie; +import org.apache.http.message.BasicHeader; + +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * Object contains url to crawl.
* It contains some additional information.
@@ -33,6 +43,18 @@ public class Request implements Serializable { * POST/GET param set * */ private Map params=new HashMap(); + + /** + * support for json,xml or more,在post时,设置此选项会使params参数和nameValuePair extra失效。 + */ + private HttpEntity entity; + + /** + * cookies for current url, if not set use Site's cookies + */ + private List cookies=new ArrayList(); + + private List
headers=new ArrayList
(); /** * Priority of the request.
@@ -145,12 +167,59 @@ public class Request implements Serializable { if (method != null ? !method.equals(request.method) : request.method != null) return false; return params != null ? params.equals(request.params) : request.params == null; } + public void addHeader(String name,String value){ + Header header=new BasicHeader(name,value); + headers.add(header); + } + public List
getHeaders(){ + return headers; + } + public void addCookie(String key,String value){ + BasicClientCookie c=new BasicClientCookie(key, value); + c.setDomain(UrlUtils.getDomain(url)); + cookies.add(c); + } + public List getCookies() { + return cookies; + } + public void setCookies(List cookies) { + this.cookies = cookies; + } + /** + * 设置json参数 + */ + public void setJsonParam(String jsonStr,String encoding){ + StringEntity e=new StringEntity(jsonStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("application/json"); + entity=e; + } + /** + * 设置xml参数 + */ + public void setXmlParam(String xmlStr,String encoding){ + StringEntity e=new StringEntity(xmlStr,encoding==null?"UTF-8":encoding); + e.setContentEncoding(encoding==null?"UTF-8":encoding); + e.setContentType("text/xml"); + entity=e; + } + public HttpEntity getEntity() { + return entity; + } + + public void setEntity(HttpEntity entity) { + this.entity = entity; + } @Override public int hashCode() { int result = url != null ? url.hashCode() : 0; result = 31 * result + (method != null ? method.hashCode() : 0); result = 31 * result + (params != null ? params.hashCode() : 0); + result = 31 * result + (headers != null ? headers.hashCode() : 0); + result = 31 * result + (entity != null ? entity.hashCode() : 0); + result = 31 * result + (cookies != null ? cookies.hashCode() : 0); + return result; } @@ -162,6 +231,10 @@ public class Request implements Serializable { ", extras=" + extras + ", params=" + params + ", priority=" + priority + + ", headers=" + headers + + ", entity=" + entity + + ", cookies="+ cookies+ '}'; } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index fa907a1d..669ba376 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,21 +1,37 @@ package us.codecraft.webmagic.downloader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; +import org.apache.http.Header; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.annotation.ThreadSafe; +import org.apache.http.client.CookieStore; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; +import org.apache.http.client.protocol.HttpClientContext; +import org.apache.http.cookie.Cookie; +import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -26,10 +42,6 @@ import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.WMCollections; -import java.io.IOException; -import java.nio.charset.Charset; -import java.util.*; - /** * The http downloader based on HttpClient. @@ -94,11 +106,26 @@ public class HttpClientDownloader extends AbstractDownloader { } HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost); - httpResponse = getHttpClient(site, proxy).execute(httpUriRequest); + HttpClientContext context=null; + if(request.getCookies()!=null && CollectionUtils.isNotEmpty(request.getCookies())){ + context=new HttpClientContext(); + CookieStore cookieStore=new BasicCookieStore(); + for(Cookie c:request.getCookies()){ + cookieStore.addCookie(c); + } + context.setCookieStore(cookieStore); + } + if(request.getHeaders()!=null && CollectionUtils.isNotEmpty(request.getHeaders())){ + for(Header h:request.getHeaders()){ + httpUriRequest.setHeader(h); + } + } + httpResponse = getHttpClient(site, proxy).execute(httpUriRequest,context); statusCode = httpResponse.getStatusLine().getStatusCode(); request.putExtra(Request.STATUS_CODE, statusCode); if (statusAccept(acceptStatCode, statusCode)) { Page page = handleResponse(request, charset, httpResponse, task); + page.setHeaders(httpResponse.getAllHeaders()); onSuccess(request); return page; } else { @@ -164,7 +191,11 @@ public class HttpClientDownloader extends AbstractDownloader { //default get return addQueryParams(RequestBuilder.get(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { - return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + if(request.getEntity()!=null){ + return RequestBuilder.post().setEntity(request.getEntity()); + }else{ + return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams()); + } } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return addQueryParams(RequestBuilder.head(),request.getParams()); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { From c93a8a27227defa82788783ca012e60dbd2a5014 Mon Sep 17 00:00:00 2001 From: xbynet Date: Fri, 31 Mar 2017 18:27:18 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AD=97=E7=AC=A6?= =?UTF-8?q?=E7=BC=96=E7=A0=81=E6=A3=80=E6=B5=8BBUG?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java index 50b4f1b6..ccf00a46 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java @@ -26,7 +26,7 @@ public abstract class CharsetUtils { // charset // 1、encoding in http header Content-Type charset = UrlUtils.getCharset(contentType); - if (StringUtils.isNotBlank(contentType)) { + if (StringUtils.isNotBlank(contentType) && StringUtils.isNotBlank(charset)) { logger.debug("Auto get charset: {}", charset); return charset; }