From d2e0f0cd33a957af5eedb62485ad745abed40af7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 6 Sep 2013 21:35:23 +0800 Subject: [PATCH] #25 use URL api in UrlUtils.canonicalizeUrl() --- .../us/codecraft/webmagic/utils/UrlUtils.java | 52 +++++++------------ .../webmagic/utils/UrlUtilsTest.java | 7 ++- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 4e1140b4..4e5f67fc 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils; import org.apache.commons.lang3.StringUtils; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.charset.Charset; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -18,47 +20,33 @@ public class UrlUtils { /** * canonicalizeUrl + * + * Borrowed from Jsoup. 
+ * @param url * @param refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { - if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { - return url; - } - if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) { - return url; - } - if (StringUtils.startsWith(url, "/")) { - String host = getHost(refer); - return host + url; - } else if (!StringUtils.startsWith(url, ".")) { - refer = reversePath(refer, 1); - return refer + "/" + url; - } else { - Matcher matcher = relativePathPattern.matcher(url); - if (matcher.find()) { - int reverseDepth = matcher.group(1).length(); - refer = reversePath(refer, reverseDepth); - String substring = StringUtils.substring(url, matcher.end()); - return refer + "/" + substring; - } else { - refer = reversePath(refer, 1); - return refer + "/" + url; + URL base; + try { + try { + base = new URL(refer); + } catch (MalformedURLException e) { + // refer is not a usable base, but url may be absolute on its own, so try parsing url directly + URL abs = new URL(url); + return abs.toExternalForm(); } + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (url.startsWith("?")) + url = base.getPath() + url; + URL abs = new URL(base, url); + return abs.toExternalForm(); + } catch (MalformedURLException e) { + return ""; } } - public static String reversePath(String url, int depth) { - int i = StringUtils.lastOrdinalIndexOf(url, "/", depth); - if (i < 10) { - url = getHost(url); - } else { - url = StringUtils.substring(url, 0, i); - } - return url; - } - public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index d1cbc21e..abe6adcc 100644 --- 
a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -19,13 +19,12 @@ public class UrlUtilsTest { fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); - fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com"); - Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); + fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/"); + Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); + Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl); } @Test