删除 fixAllRelativeHrefs 并修复 SeleniumDownloader 对 fixAllRelativeHrefs 的依赖

pull/528/head
GZhY 8 years ago
parent bc6e81e00f
commit ce3f0ac239

@ -92,41 +92,6 @@ public class UrlUtils {
} }
} }
/**
 * Matches the {@code href} attribute of an {@code <a>} tag when the value is
 * wrapped in single or double quotes (case-insensitive).
 * Blank space is allowed inside the quoted value.
 * Group 1 is the tag text from {@code <a} up to and including {@code href=};
 * group 2 is the attribute value without its surrounding quotes.
 */
private static Pattern patternForHrefWithQuote = Pattern.compile("(<a[^<>]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE);
/**
 * Matches the {@code href} attribute of an {@code <a>} tag when the value is
 * not quoted (case-insensitive).
 * Blank space is not allowed here: the value ends at the first whitespace,
 * quote or angle bracket.
 * Group 1 is the tag text from {@code <a} up to and including {@code href=};
 * group 2 is the bare attribute value.
 */
private static Pattern patternForHrefWithoutQuote = Pattern.compile("(<a[^<>]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE);
/**
 * Rewrites the {@code href} values of anchor tags in an HTML string.
 * Quoted hrefs are processed first, then unquoted ones; each pass delegates
 * to {@link #replaceByPattern(String, String, Pattern)}.
 *
 * @param html the HTML text to process
 * @param url  the page URL handed to the rewrite step for each href
 * @return the HTML after both passes
 */
public static String fixAllRelativeHrefs(String html, String url) {
    String afterQuotedPass = replaceByPattern(html, url, patternForHrefWithQuote);
    return replaceByPattern(afterQuotedPass, url, patternForHrefWithoutQuote);
}
/**
 * Replaces every match of {@code pattern} in {@code html} with the match's
 * first group followed by a double-quoted, canonicalized form of its second
 * group.
 *
 * @param html    the HTML text to scan
 * @param url     passed to {@code canonicalizeUrl} together with each captured value
 *                (presumably the base URL used for resolution; see that method)
 * @param pattern a pattern whose group 1 is the text to keep verbatim and
 *                whose group 2 is the raw href value
 * @return the rewritten HTML, or the very same {@code html} instance when the
 *         pattern does not match at all
 */
public static String replaceByPattern(String html, String url, Pattern pattern) {
    Matcher hrefMatcher = pattern.matcher(html);
    // Nothing to rewrite: hand back the input untouched.
    if (!hrefMatcher.find()) {
        return html;
    }
    StringBuilder rewritten = new StringBuilder();
    int copiedUpTo = 0;
    do {
        // Text between the previous match and this one is copied as-is.
        rewritten.append(html, copiedUpTo, hrefMatcher.start());
        rewritten.append(hrefMatcher.group(1));
        // The value is always emitted in double quotes, whatever the original quoting was.
        rewritten.append("\"").append(canonicalizeUrl(hrefMatcher.group(2), url)).append("\"");
        copiedUpTo = hrefMatcher.end();
    } while (hrefMatcher.find());
    // Remainder after the last match.
    rewritten.append(html, copiedUpTo, html.length());
    return rewritten.toString();
}
public static List<Request> convertToRequests(Collection<String> urls) { public static List<Request> convertToRequests(Collection<String> urls) {
List<Request> requestList = new ArrayList<Request>(urls.size()); List<Request> requestList = new ArrayList<Request>(urls.size());
for (String url : urls) { for (String url : urls) {

@ -33,25 +33,6 @@ public class UrlUtilsTest {
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa");
} }
/**
 * Checks href rewriting for double-quoted, single-quoted and unquoted
 * anchor attributes against a fixed base URL.
 */
@Test
public void testFixAllRelativeHrefs() {
    final String baseUrl = "http://www.dianping.com/";

    // Double-quoted relative href becomes absolute.
    assertThat(UrlUtils.fixAllRelativeHrefs("<a href=\"/start\">", baseUrl))
            .isEqualTo("<a href=\"http://www.dianping.com/start\">");

    // A blank inside a double-quoted value is kept as part of the URL and encoded.
    assertThat(UrlUtils.fixAllRelativeHrefs("<a href=\"/start a\">", baseUrl))
            .isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");

    // Single quotes are accepted on input and normalized to double quotes on output.
    assertThat(UrlUtils.fixAllRelativeHrefs("<a href='/start a'>", baseUrl))
            .isEqualTo("<a href=\"http://www.dianping.com/start%20a\">");

    // An unquoted value stops at the first blank; the trailing attribute is left alone.
    assertThat(UrlUtils.fixAllRelativeHrefs("<a href=/start tag>", baseUrl))
            .isEqualTo("<a href=\"http://www.dianping.com/start\" tag>");
}
@Test @Test
public void testGetDomain(){ public void testGetDomain(){
String url = "http://www.dianping.com/aa/"; String url = "http://www.dianping.com/aa/";

@ -5,7 +5,6 @@ import org.openqa.selenium.By;
import org.openqa.selenium.Cookie; import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement; import org.openqa.selenium.WebElement;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
@ -13,7 +12,6 @@ import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
@ -108,8 +106,7 @@ public class SeleniumDownloader implements Downloader, Closeable {
String content = webElement.getAttribute("outerHTML"); String content = webElement.getAttribute("outerHTML");
Page page = new Page(); Page page = new Page();
page.setRawText(content); page.setRawText(content);
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, page.setHtml(new Html(content, request.getUrl()));
request.getUrl())));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
webDriverPool.returnToPool(webDriver); webDriverPool.returnToPool(webDriver);

Loading…
Cancel
Save