diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
new file mode 100644
index 00000000..7c32dbc1
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/CustomRedirectStrategy.java
@@ -0,0 +1,75 @@
+package us.codecraft.webmagic.downloader;
+
+import java.net.URI;
+
+import org.apache.http.HttpEntityEnclosingRequest;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.ProtocolException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpRequestWrapper;
+import org.apache.http.client.methods.HttpUriRequest;
+import org.apache.http.impl.client.LaxRedirectStrategy;
+import org.apache.http.protocol.HttpContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Redirect strategy that keeps a redirected POST a POST, including its request body.
+ *
+ * <p>With HttpClient's stock {@code new LaxRedirectStrategy()} the data of the original
+ * request is not passed on in a post/redirect/post exchange. This class re-issues the
+ * POST against the redirect target with the original headers and entity instead.
+ * Requests that are not POSTs, and POSTs answered with 303 See Other (which the parent
+ * strategy follows with an {@link HttpGet}), are delegated to {@link LaxRedirectStrategy}.
+ *
+ * <p>Based on the redirect strategy of the SeimiCrawler project:
+ * https://github.com/zhegexiaohuozi/SeimiCrawler/blob/master/project/src/main/java/cn/wanghaomiao/seimi/http/hc/SeimiRedirectStrategy.java
+ */
+public class CustomRedirectStrategy extends LaxRedirectStrategy {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(CustomRedirectStrategy.class);
+
+    /**
+     * Builds the follow-up request for a redirect response.
+     *
+     * @param request  the request that was answered with a redirect; may be an
+     *                 {@link HttpRequestWrapper} around the request the caller submitted
+     * @param response the redirect response carrying the target location
+     * @param context  the execution context of the exchange
+     * @return a new {@link HttpPost} carrying the original headers and entity when a POST
+     *         is redirected with a status other than 303; otherwise the parent's result
+     * @throws ProtocolException if the redirect location cannot be resolved
+     */
+    @Override
+    public HttpUriRequest getRedirect(HttpRequest request, HttpResponse response, HttpContext context)
+            throws ProtocolException {
+        String method = request.getRequestLine().getMethod();
+        int status = response.getStatusLine().getStatusCode();
+        // Delegate before resolving the location: the parent resolves it itself, and
+        // resolving it twice can trip HttpClient's circular-redirect detection.
+        if (!HttpPost.METHOD_NAME.equalsIgnoreCase(method) || status == HttpStatus.SC_SEE_OTHER) {
+            return super.getRedirect(request, response, context);
+        }
+
+        URI uri = getLocationURI(request, response, context);
+        // Copy from the request as the caller submitted it, not from the wrapper, whose
+        // headers may already contain values generated for the previous target.
+        HttpRequest source = request instanceof HttpRequestWrapper
+                ? ((HttpRequestWrapper) request).getOriginal()
+                : request;
+
+        HttpPost redirect = new HttpPost(uri);
+        redirect.setHeaders(source.getAllHeaders());
+        // HttpClient derives the framing headers from the entity and rejects pre-set ones.
+        redirect.removeHeaders("Content-Length");
+        redirect.removeHeaders("Transfer-Encoding");
+        if (source instanceof HttpEntityEnclosingRequest) {
+            redirect.setEntity(((HttpEntityEnclosingRequest) source).getEntity());
+        }
+        LOGGER.debug("Re-issuing POST to redirect location {}", uri);
+        return redirect;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 80a7e29e..ef98a479 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -89,8 +89,9 @@ public class HttpClientGenerator {
                 }
             });
         }
-
-
+        // Keep POST requests (and their body) intact across post/redirect/post 302 redirects.
+        httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
+
         SocketConfig socketConfig = SocketConfig.custom().setSoTimeout(site.getTimeOut()).setSoKeepAlive(true).setTcpNoDelay(true).build();
         httpClientBuilder.setDefaultSocketConfig(socketConfig);
         connectionManager.setDefaultSocketConfig(socketConfig);