diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 11ba1c58..5d2af734 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,7 +1,6 @@
package us.codecraft.webmagic.downloader;
import com.google.common.collect.Sets;
-import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
@@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
-import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.IOException;
-import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
@@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader {
}
}
- private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException {
- // 1、head头部包含编码集
+ // 1、encoding in http header Content-Type
String value = httpResponse.getEntity().getContentType().getValue();
String charset = UrlUtils.getCharset(value);
- if(StringUtils.isEmpty(charset)) {
- // 2、meta元素中包含编码集
+ if (StringUtils.isEmpty(charset)) {
+ // 2、charset in meta
String content = IOUtils.toString(httpResponse.getEntity().getContent());
- if(StringUtils.isNotEmpty(content)) {
+ if (StringUtils.isNotEmpty(content)) {
Document document = Jsoup.parse(content);
Elements links = document.select("meta");
- for(Element link : links) {
- // 2.1、处理场景:
+ for (Element link : links) {
+ // 2.1、charset embedded in the meta "content" attribute, e.g. content="text/html; charset=utf-8"
String metaContent = link.attr("content");
String metaCharset = link.attr("charset");
- if(metaContent.indexOf("charset") != -1) {
+ if (metaContent.indexOf("charset") != -1) {
metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
charset = metaContent.split("=")[1];
break;
}
- // 2.2、处理场景:
- else if(StringUtils.isNotEmpty(metaCharset)) {
+ // 2.2、charset given directly by the meta "charset" attribute, e.g. <meta charset="utf-8">
+ else if (StringUtils.isNotEmpty(metaCharset)) {
charset = metaCharset;
break;
}
}
-
- // 3、以上两种都不包含的场景
- if(StringUtils.isEmpty(charset)) {
- java.nio.charset.Charset nioCharset = null;
- try {
- nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length());
- charset = nioCharset.name();
- } catch (IOException e) {
- // ignore
- }
- }
+ // 3、TODO: when neither the header nor a meta tag yields a charset, detect it from the content with a tool such as cpdetector
}
}
return charset;
diff --git a/webmagic-core/src/main/lib/antlr-2.7.4.jar b/webmagic-core/src/main/lib/antlr-2.7.4.jar
deleted file mode 100644
index 45e45b5c..00000000
Binary files a/webmagic-core/src/main/lib/antlr-2.7.4.jar and /dev/null differ
diff --git a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar
deleted file mode 100644
index 47329f22..00000000
Binary files a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar and /dev/null differ
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 072de135..09855a04 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -1,7 +1,6 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.Ignore;
import org.junit.Test;
@@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
+import java.io.IOException;
import java.io.UnsupportedEncodingException;
import static org.assertj.core.api.Assertions.assertThat;
@@ -57,29 +57,20 @@ public class HttpClientDownloaderTest {
}
@Test
- public void testGetHtmlCharset() {
+ public void testGetHtmlCharset() throws IOException {
HttpClientDownloader downloader = new HttpClientDownloader();
Site site = Site.me();
CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
- try {
- // 头部包含编码
- Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
- CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
- String charset = downloader.getHtmlCharset(httpResponse);
- assertEquals(charset, "GBK");
+ // encoding in http header Content-Type
+ Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
+ CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null));
+ String charset = downloader.getHtmlCharset(httpResponse);
+ assertEquals(charset, "GBK");
- // meta包含编码
- Request requestUTF_8 = new Request("http://preshing.com/");
- httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
- charset = downloader.getHtmlCharset(httpResponse);
- assertEquals(charset, "utf-8");
-
-// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005");
-// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null));
-// charset = downloader.getHtmlCharset(httpResponse);
-// assertEquals(charset, "GBK");
- } catch (Exception e) {
- e.printStackTrace();
- }
+ // encoding in meta
+ Request requestUTF_8 = new Request("http://preshing.com/");
+ httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null));
+ charset = downloader.getHtmlCharset(httpResponse);
+ assertEquals(charset, "utf-8");
}
}