diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 11ba1c58..5d2af734 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import com.google.common.collect.Sets; -import info.monitorenter.cpdetector.io.CodepageDetectorProxy; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpResponse; @@ -24,12 +23,11 @@ import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; -import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.HttpConstant; import us.codecraft.webmagic.utils.UrlUtils; import java.io.IOException; -import java.net.URL; import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -122,44 +120,33 @@ public class HttpClientDownloader extends AbstractDownloader { } } - private static CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance(); protected String getHtmlCharset(CloseableHttpResponse httpResponse) throws IOException { - // 1、head头部包含编码集 + // 1、encoding in http header Content-Type String value = httpResponse.getEntity().getContentType().getValue(); String charset = UrlUtils.getCharset(value); - if(StringUtils.isEmpty(charset)) { - // 2、meta元素中包含编码集 + if (StringUtils.isEmpty(charset)) { + // 2、charset in meta String content = IOUtils.toString(httpResponse.getEntity().getContent()); - if(StringUtils.isNotEmpty(content)) { + if (StringUtils.isNotEmpty(content)) { Document document = Jsoup.parse(content); Elements links = document.select("meta"); - for(Element link : links) { - // 2.1、处理场景: + for (Element link : links) { + // 2.1、 String metaContent = link.attr("content"); String metaCharset = link.attr("charset"); - if(metaContent.indexOf("charset") != -1) { + if (metaContent.indexOf("charset") != -1) { metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); charset = metaContent.split("=")[1]; break; } - // 2.2、处理场景: - else if(StringUtils.isNotEmpty(metaCharset)) { + // 2.2、 + else if (StringUtils.isNotEmpty(metaCharset)) { charset = metaCharset; break; } } - - // 3、以上两种都不包含的场景 - if(StringUtils.isEmpty(charset)) { - java.nio.charset.Charset nioCharset = null; - try { - nioCharset = detector.detectCodepage(httpResponse.getEntity().getContent(), content.length()); - charset = nioCharset.name(); - } catch (IOException e) { - // ignore - } - } + // 3、todo use tools as cpdetector for content decode } } return charset; diff --git a/webmagic-core/src/main/lib/antlr-2.7.4.jar b/webmagic-core/src/main/lib/antlr-2.7.4.jar deleted file mode 100644 index 45e45b5c..00000000 Binary files a/webmagic-core/src/main/lib/antlr-2.7.4.jar and /dev/null differ diff --git a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar b/webmagic-core/src/main/lib/cpdetector_1.0.10.jar deleted file mode 100644 index 47329f22..00000000 Binary files a/webmagic-core/src/main/lib/cpdetector_1.0.10.jar and /dev/null differ diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 072de135..09855a04 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.CloseableHttpClient; import org.junit.Ignore; import org.junit.Test; @@ -11,6 +10,7 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Html; +import java.io.IOException; import java.io.UnsupportedEncodingException; import static org.assertj.core.api.Assertions.assertThat; @@ -57,29 +57,20 @@ public class HttpClientDownloaderTest { } @Test - public void testGetHtmlCharset() { + public void testGetHtmlCharset() throws IOException { HttpClientDownloader downloader = new HttpClientDownloader(); Site site = Site.me(); CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site); - try { - // 头部包含编码 - Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); - CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); - String charset = downloader.getHtmlCharset(httpResponse); - assertEquals(charset, "GBK"); + // encoding in http header Content-Type + Request requestGBK = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); + CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestGBK, site, null)); + String charset = downloader.getHtmlCharset(httpResponse); + assertEquals(charset, "GBK"); - // meta包含编码 - Request requestUTF_8 = new Request("http://preshing.com/"); - httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null)); - charset = downloader.getHtmlCharset(httpResponse); - assertEquals(charset, "utf-8"); - -// Request request = new Request("http://sports.163.com/14/0514/13/9S7986F300051CA1.html#p=9RGQDGGH0AI90005"); -// httpResponse = httpClient.execute(downloader.getHttpUriRequest(request, site, null)); -// charset = downloader.getHtmlCharset(httpResponse); -// assertEquals(charset, "GBK"); - } catch (Exception e) { - e.printStackTrace(); - } + // encoding in meta + Request requestUTF_8 = new Request("http://preshing.com/"); + httpResponse = httpClient.execute(downloader.getHttpUriRequest(requestUTF_8, site, null)); + charset = downloader.getHtmlCharset(httpResponse); + assertEquals(charset, "utf-8"); } }