#613 add charset to page

pull/638/head
yihua.huang 8 years ago
parent 65049baca4
commit 32f1f2cf44

@@ -113,7 +113,11 @@ public class HttpClientDownloader extends AbstractDownloader {
Page page = new Page();
page.setBytes(bytes);
if (!request.isBinaryContent()){
page.setRawText(getResponseContent(charset, contentType, bytes));
if (charset == null) {
charset = getHtmlCharset(contentType, bytes);
}
page.setCharset(charset);
page.setRawText(new String(bytes, charset));
}
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
@@ -125,21 +129,12 @@ public class HttpClientDownloader extends AbstractDownloader {
return page;
}
private String getResponseContent(String charset, String contentType, byte[] bytes) throws IOException {
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
String charset = CharsetUtils.detectCharset(contentType, contentBytes);
if (charset == null) {
String htmlCharset = getHtmlCharset(contentType, bytes);
if (htmlCharset != null) {
return new String(bytes, htmlCharset);
} else {
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
return new String(bytes);
}
} else {
return new String(bytes, charset);
charset = Charset.defaultCharset().name();
logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
}
}
private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
return CharsetUtils.detectCharset(contentType, contentBytes);
return charset;
}
}

Loading…
Cancel
Save