From 8b8f535c309658c3e33c1a2e53b61d1fce13651e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 11 Mar 2017 10:43:10 +0800 Subject: [PATCH] refactor:extract charset detect to utils --- .../downloader/HttpClientDownloader.java | 43 +------------ .../webmagic/utils/CharsetUtils.java | 61 +++++++++++++++++++ 2 files changed, 63 insertions(+), 41 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index e57d5cd0..ca35867c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.downloader; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; @@ -13,10 +12,6 @@ import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.RequestBuilder; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.util.EntityUtils; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; @@ -25,8 +20,8 @@ import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.proxy.Proxy; import us.codecraft.webmagic.selector.PlainText; +import us.codecraft.webmagic.utils.CharsetUtils; import us.codecraft.webmagic.utils.HttpConstant; -import us.codecraft.webmagic.utils.UrlUtils; import us.codecraft.webmagic.utils.WMCollections; import java.io.IOException; @@ -213,40 +208,6 @@ public class 
HttpClientDownloader extends AbstractDownloader {
     }
 
     protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
-        String charset;
-        // charset
-        // 1、encoding in http header Content-Type
-        String value = httpResponse.getEntity().getContentType().getValue();
-        charset = UrlUtils.getCharset(value);
-        if (StringUtils.isNotBlank(charset)) {
-            logger.debug("Auto get charset: {}", charset);
-            return charset;
-        }
-        // use default charset to decode first time
-        Charset defaultCharset = Charset.defaultCharset();
-        String content = new String(contentBytes, defaultCharset.name());
-        // 2、charset in meta
-        if (StringUtils.isNotEmpty(content)) {
-            Document document = Jsoup.parse(content);
-            Elements links = document.select("meta");
-            for (Element link : links) {
-                // 2.1、html4.01
-                String metaContent = link.attr("content");
-                String metaCharset = link.attr("charset");
-                if (metaContent.indexOf("charset") != -1) {
-                    metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
-                    charset = metaContent.split("=")[1];
-                    break;
-                }
-                // 2.2、html5
-                else if (StringUtils.isNotEmpty(metaCharset)) {
-                    charset = metaCharset;
-                    break;
-                }
-            }
-        }
-        logger.debug("Auto get charset: {}", charset);
-        // 3、todo use tools as cpdetector for content decode
-        return charset;
+        return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue(), contentBytes); // Content-Type header may be absent (null)
     }
 }
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
new file mode 100644
index 00000000..50b4f1b6
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
@@ -0,0 +1,61 @@
+package us.codecraft.webmagic.utils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import 
org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+/**
+ * Helpers for detecting the charset of downloaded content.
+ * @author code4crafter@gmail.com
+ * Date: 17/3/11 Time: 10:36
+ * @since 0.6.2
+ */
+public abstract class CharsetUtils {
+    private static Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
+
+    /**
+     * Detects the charset from the Content-Type header value, falling back to meta tags in the body.
+     * @param contentType  value of the HTTP Content-Type header
+     * @param contentBytes raw body; decoded with the platform default charset to scan its meta tags
+     * @return the declared charset, or the blank result of UrlUtils.getCharset when none is declared
+     */
+    public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
+        // 1. charset declared in the HTTP Content-Type header
+        String charset = UrlUtils.getCharset(contentType);
+        if (StringUtils.isNotBlank(charset)) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // use default charset to decode first time
+        String content = new String(contentBytes, Charset.defaultCharset());
+        // 2. charset declared in a meta tag
+        if (StringUtils.isNotEmpty(content)) {
+            Document document = Jsoup.parse(content);
+            Elements links = document.select("meta");
+            for (Element link : links) {
+                String metaContent = link.attr("content");
+                String metaCharset = link.attr("charset");
+                // 2.1 html4.01: content="...; charset=xxx"; "charset" with no "=value" is skipped, not an error
+                int charsetIdx = metaContent.indexOf("charset");
+                String[] parts = charsetIdx == -1 ? new String[0] : metaContent.substring(charsetIdx).split("=");
+                if (parts.length > 1) {
+                    charset = parts[1];
+                    break;
+                } else if (StringUtils.isNotEmpty(metaCharset)) { // 2.2 html5: <meta charset="xxx">
+                    charset = metaCharset;
+                    break;
+                }
+            }
+        }
+        logger.debug("Auto get charset: {}", charset);
+        // 3. todo use tools as cpdetector for content decode
+        return charset;
+    }
+}