|
|
|
@ -1,5 +1,11 @@
|
|
|
|
|
package us.codecraft.webmagic.downloader;
|
|
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
|
import java.util.function.Predicate;
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
|
import org.apache.http.HttpResponse;
|
|
|
|
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
|
|
|
@ -7,6 +13,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
|
|
|
|
import org.apache.http.util.EntityUtils;
|
|
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
|
|
|
|
import us.codecraft.webmagic.Page;
|
|
|
|
|
import us.codecraft.webmagic.Request;
|
|
|
|
|
import us.codecraft.webmagic.Site;
|
|
|
|
@ -17,13 +24,6 @@ import us.codecraft.webmagic.selector.PlainText;
|
|
|
|
|
import us.codecraft.webmagic.utils.CharsetUtils;
|
|
|
|
|
import us.codecraft.webmagic.utils.HttpClientUtils;
|
|
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
|
import java.util.function.Predicate;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The http downloader based on HttpClient.
|
|
|
|
@ -92,13 +92,7 @@ public class HttpClientDownloader extends AbstractDownloader {
|
|
|
|
|
return page;
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
logger.warn("download page {} error", request.getUrl(), e);
|
|
|
|
|
onError(request, e, proxyProvider);
|
|
|
|
|
if (proxyProvider != null && refreshProxyOnError.test(e)) {
|
|
|
|
|
proxyProvider.refreshProxy(task);
|
|
|
|
|
}
|
|
|
|
|
if(refreshClientOnError.test(e)) {
|
|
|
|
|
httpClients.remove(task.getSite().getDomain());
|
|
|
|
|
}
|
|
|
|
|
onError(request);
|
|
|
|
|
return page;
|
|
|
|
|
} finally {
|
|
|
|
|
if (httpResponse != null) {
|
|
|
|
|