#25 use URL api in UrlUtils.canonicalizeUrl()

pull/88/head
yihua.huang 12 years ago
parent 363fd38ccb
commit d2e0f0cd33

@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -18,47 +20,33 @@ public class UrlUtils {
/** /**
* canonicalizeUrl * canonicalizeUrl
*
* Borrowed from Jsoup.
*
* @param url * @param url
* @param refer * @param refer
* @return canonicalizeUrl * @return canonicalizeUrl
*/ */
public static String canonicalizeUrl(String url, String refer) { public static String canonicalizeUrl(String url, String refer) {
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) { URL base;
return url; try {
} try {
if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) { base = new URL(refer);
return url; } catch (MalformedURLException e) {
} // the base is unsuitable, but the attribute may be abs on its own, so try that
if (StringUtils.startsWith(url, "/")) { URL abs = new URL(refer);
String host = getHost(refer); return abs.toExternalForm();
return host + url;
} else if (!StringUtils.startsWith(url, ".")) {
refer = reversePath(refer, 1);
return refer + "/" + url;
} else {
Matcher matcher = relativePathPattern.matcher(url);
if (matcher.find()) {
int reverseDepth = matcher.group(1).length();
refer = reversePath(refer, reverseDepth);
String substring = StringUtils.substring(url, matcher.end());
return refer + "/" + substring;
} else {
refer = reversePath(refer, 1);
return refer + "/" + url;
} }
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
if (url.startsWith("?"))
url = base.getPath() + url;
URL abs = new URL(base, url);
return abs.toExternalForm();
} catch (MalformedURLException e) {
return "";
} }
} }
/**
 * Cuts {@code url} off just before the {@code depth}-th "/" counted from the end.
 *
 * When that slash sits at an index below 10 (which includes the case where
 * the lookup yields a negative index), the result of {@code getHost(url)} is
 * returned instead.
 * NOTE(review): the threshold 10 presumably keeps the cut from landing inside
 * the scheme/host part of the address; confirm before changing it.
 *
 * @param url   the address to shorten
 * @param depth how many trailing "/" separators to step back over
 * @return the shortened address, or {@code getHost(url)} when the cut point is below index 10
 */
public static String reversePath(String url, int depth) {
    int cutIndex = StringUtils.lastOrdinalIndexOf(url, "/", depth);
    return cutIndex < 10 ? getHost(url) : StringUtils.substring(url, 0, cutIndex);
}
public static String getHost(String url) { public static String getHost(String url) {
String host = url; String host = url;
int i = StringUtils.ordinalIndexOf(url, "/", 3); int i = StringUtils.ordinalIndexOf(url, "/", 3);

@ -19,13 +19,12 @@ public class UrlUtilsTest {
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl); Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com"); fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
} }
@Test @Test

Loading…
Cancel
Save