From 6f5b9e448e022ad4e72bc5a1e60a2bb71d422a37 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 29 Jul 2017 11:27:56 +0800 Subject: [PATCH] #627 set charset to request --- .../java/us/codecraft/webmagic/Request.java | 10 ++++++ .../downloader/HttpClientDownloader.java | 2 +- .../downloader/HttpClientDownloaderTest.java | 33 +++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 9d0b9ccf..938f0e87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -51,6 +51,8 @@ public class Request implements Serializable { */ private boolean binaryContent = false; + private String charset; + public Request() { } @@ -176,6 +178,14 @@ public class Request implements Serializable { this.binaryContent = binaryContent; } + public String getCharset() { + return charset; + } + + public void setCharset(String charset) { + this.charset = charset; + } + @Override public String toString() { return "Request{" + diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 4e19e7cc..fff7c7cf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -83,7 +83,7 @@ public class HttpClientDownloader extends AbstractDownloader { Page page = Page.fail(); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); - page = handleResponse(request, task.getSite().getCharset(), httpResponse, task); + page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 6a1c8319..04a45a02 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -289,4 +289,37 @@ public class HttpClientDownloaderTest { }); } + @Test + public void test_download_set_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response(header("Content-Type","text/html; charset=utf-8")).response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + + @Test + public void test_download_set_request_charset() throws Exception { + HttpServer server = httpServer(13423); + server.response("hello world!"); + Runner.running(server, new Runnable() { + @Override + public void run() throws Exception { + final HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); + Request request = new Request(); + request.setCharset("utf-8"); + request.setUrl("http://127.0.0.1:13423/"); + Page page = httpClientDownloader.download(request, Site.me().setCharset("gbk").toTask()); + assertThat(page.getCharset()).isEqualTo("utf-8"); + } + }); + } + }