|
|
|
@ -92,41 +92,6 @@ public class UrlUtils {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* allow blank space in quote
|
|
|
|
|
*/
|
|
|
|
|
private static Pattern patternForHrefWithQuote = Pattern.compile("(<a[^<>]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* disallow blank space without quote
|
|
|
|
|
*/
|
|
|
|
|
private static Pattern patternForHrefWithoutQuote = Pattern.compile("(<a[^<>]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);
|
|
|
|
|
|
|
|
|
|
public static String fixAllRelativeHrefs(String html, String url) {
|
|
|
|
|
html = replaceByPattern(html, url, patternForHrefWithQuote);
|
|
|
|
|
html = replaceByPattern(html, url, patternForHrefWithoutQuote);
|
|
|
|
|
return html;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String replaceByPattern(String html, String url, Pattern pattern) {
|
|
|
|
|
StringBuilder stringBuilder = new StringBuilder();
|
|
|
|
|
Matcher matcher = pattern.matcher(html);
|
|
|
|
|
int lastEnd = 0;
|
|
|
|
|
boolean modified = false;
|
|
|
|
|
while (matcher.find()) {
|
|
|
|
|
modified = true;
|
|
|
|
|
stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start()));
|
|
|
|
|
stringBuilder.append(matcher.group(1));
|
|
|
|
|
stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\"");
|
|
|
|
|
lastEnd = matcher.end();
|
|
|
|
|
}
|
|
|
|
|
if (!modified) {
|
|
|
|
|
return html;
|
|
|
|
|
}
|
|
|
|
|
stringBuilder.append(StringUtils.substring(html, lastEnd));
|
|
|
|
|
return stringBuilder.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static List<Request> convertToRequests(Collection<String> urls) {
|
|
|
|
|
List<Request> requestList = new ArrayList<Request>(urls.size());
|
|
|
|
|
for (String url : urls) {
|
|
|
|
|