From 3c3f0011869f58fbb7524ebcade491236e79d1c5 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 8 Jun 2013 20:29:53 +0800 Subject: [PATCH] fix charset bug --- .../codecraft/spider/downloader/HttpClientDownloader.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java b/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java index fc6a1073..c817fb6c 100644 --- a/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java +++ b/src/main/java/us/codecraft/spider/downloader/HttpClientDownloader.java @@ -31,8 +31,12 @@ public class HttpClientDownloader implements Downloader { HttpResponse httpResponse = httpClient.execute(httpGet); int statusCode = httpResponse.getStatusLine().getStatusCode(); if (site.getAcceptStatCode().contains(statusCode)) { + if (site.getEncoding() == null){ + String value = httpResponse.getEntity().getContentType().getValue(); + site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString()); + } String content = IOUtils.toString(httpResponse.getEntity().getContent(), - site.getEncoding() == null ? httpResponse.getEntity().getContentType().getValue() : site.getEncoding()); + site.getEncoding()); Page page = new Page(); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl()))); page.setUrl(new PlainText(request.getUrl()));