|
|
@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils;
|
|
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import java.net.MalformedURLException;
|
|
|
|
|
|
|
|
import java.net.URL;
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
@ -18,47 +20,33 @@ public class UrlUtils {
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* canonicalizeUrl
|
|
|
|
* canonicalizeUrl
|
|
|
|
|
|
|
|
*
|
|
|
|
|
|
|
|
* Borrowed from Jsoup.
|
|
|
|
|
|
|
|
*
|
|
|
|
* @param url
|
|
|
|
* @param url
|
|
|
|
* @param refer
|
|
|
|
* @param refer
|
|
|
|
* @return canonicalizeUrl
|
|
|
|
* @return canonicalizeUrl
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
public static String canonicalizeUrl(String url, String refer) {
|
|
|
|
public static String canonicalizeUrl(String url, String refer) {
|
|
|
|
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
|
|
|
|
URL base;
|
|
|
|
return url;
|
|
|
|
try {
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) {
|
|
|
|
base = new URL(refer);
|
|
|
|
return url;
|
|
|
|
} catch (MalformedURLException e) {
|
|
|
|
}
|
|
|
|
// the base is unsuitable, but the attribute may be abs on its own, so try that
|
|
|
|
if (StringUtils.startsWith(url, "/")) {
|
|
|
|
URL abs = new URL(refer);
|
|
|
|
String host = getHost(refer);
|
|
|
|
return abs.toExternalForm();
|
|
|
|
return host + url;
|
|
|
|
|
|
|
|
} else if (!StringUtils.startsWith(url, ".")) {
|
|
|
|
|
|
|
|
refer = reversePath(refer, 1);
|
|
|
|
|
|
|
|
return refer + "/" + url;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
Matcher matcher = relativePathPattern.matcher(url);
|
|
|
|
|
|
|
|
if (matcher.find()) {
|
|
|
|
|
|
|
|
int reverseDepth = matcher.group(1).length();
|
|
|
|
|
|
|
|
refer = reversePath(refer, reverseDepth);
|
|
|
|
|
|
|
|
String substring = StringUtils.substring(url, matcher.end());
|
|
|
|
|
|
|
|
return refer + "/" + substring;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
refer = reversePath(refer, 1);
|
|
|
|
|
|
|
|
return refer + "/" + url;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
|
|
|
|
|
|
|
|
if (url.startsWith("?"))
|
|
|
|
|
|
|
|
url = base.getPath() + url;
|
|
|
|
|
|
|
|
URL abs = new URL(base, url);
|
|
|
|
|
|
|
|
return abs.toExternalForm();
|
|
|
|
|
|
|
|
} catch (MalformedURLException e) {
|
|
|
|
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
public static String reversePath(String url, int depth) {
|
|
|
|
|
|
|
|
int i = StringUtils.lastOrdinalIndexOf(url, "/", depth);
|
|
|
|
|
|
|
|
if (i < 10) {
|
|
|
|
|
|
|
|
url = getHost(url);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
url = StringUtils.substring(url, 0, i);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return url;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static String getHost(String url) {
|
|
|
|
public static String getHost(String url) {
|
|
|
|
String host = url;
|
|
|
|
String host = url;
|
|
|
|
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
|
|
|
int i = StringUtils.ordinalIndexOf(url, "/", 3);
|
|
|
|