diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java index af2fef46..28a7ce5e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpUriRequestConverter.java @@ -58,7 +58,7 @@ public class HttpUriRequestConverter { } private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) { - RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl()); + RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl())); if (site.getHeaders() != null) { for (Map.Entry headerEntry : site.getHeaders().entrySet()) { requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 87a6a567..c61483a3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -43,7 +43,7 @@ public class UrlUtils { if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); - return encodeIllegalCharacterInUrl(abs.toExternalForm()); + return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } @@ -53,12 +53,17 @@ public class UrlUtils { * * @param url url * @return new url + * @deprecated */ public static String encodeIllegalCharacterInUrl(String url) { - //TODO more charator support return url.replace(" ", "%20"); } + public static String fixIllegalCharacterInUrl(String url) { + //TODO more charator support + return url.replace(" ", "%20").replaceAll("#+", "#"); + } + public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java new file mode 100644 index 00000000..15902e86 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpUriRequestConverterTest.java @@ -0,0 +1,31 @@ +package us.codecraft.webmagic.downloader; + +import org.junit.Test; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.utils.UrlUtils; + +import java.net.URI; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * @author code4crafter@gmail.com + * Date: 2017/7/22 + * Time: 下午5:29 + */ +public class HttpUriRequestConverterTest { + + @Test(expected = IllegalArgumentException.class) + public void test_illegal_uri() throws Exception { + HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + httpUriRequestConverter.convert(new Request("http://bj.zhongkao.com/beikao/yimo/##"), Site.me(), null); + } + + @Test + public void test_illegal_uri_correct() throws Exception { + HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter(); + HttpClientRequestContext requestContext = httpUriRequestConverter.convert(new Request(UrlUtils.fixIllegalCharacterInUrl("http://bj.zhongkao.com/beikao/yimo/##")), Site.me(), null); + assertThat(requestContext.getHttpUriRequest().getURI()).isEqualTo(new URI("http://bj.zhongkao.com/beikao/yimo/#")); + } +} \ No newline at end of file