From ce3f0ac23968acee622017220f67d2d0874a477b Mon Sep 17 00:00:00 2001 From: GZhY Date: Sun, 9 Apr 2017 21:01:32 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=20fixAllRelativeHrefs=20?= =?UTF-8?q?=E5=B9=B6=E4=BF=AE=E5=A4=8D=20SeleniumDownloader=20=E5=AF=B9=20?= =?UTF-8?q?fixAllRelativeHrefs=20=E7=9A=84=E4=BE=9D=E8=B5=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../us/codecraft/webmagic/utils/UrlUtils.java | 35 ------------------- .../webmagic/utils/UrlUtilsTest.java | 19 ---------- .../selenium/SeleniumDownloader.java | 5 +-- 3 files changed, 1 insertion(+), 58 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 72a9d3f8..68646066 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -92,41 +92,6 @@ public class UrlUtils { } } - /** - * allow blank space in quote - */ - private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); - - /** - * disallow blank space without quote - */ - private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); - - public static String fixAllRelativeHrefs(String html, String url) { - html = replaceByPattern(html, url, patternForHrefWithQuote); - html = replaceByPattern(html, url, patternForHrefWithoutQuote); - return html; - } - - public static String replaceByPattern(String html, String url, Pattern pattern) { - StringBuilder stringBuilder = new StringBuilder(); - Matcher matcher = pattern.matcher(html); - int lastEnd = 0; - boolean modified = false; - while (matcher.find()) { - modified = true; - stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); - stringBuilder.append(matcher.group(1)); - stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); - lastEnd = matcher.end(); - } - if (!modified) { - return html; - } - stringBuilder.append(StringUtils.substring(html, lastEnd)); - return stringBuilder.toString(); - } - public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java index a90304dc..6afdeefe 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java @@ -33,25 +33,6 @@ public class UrlUtilsTest { assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/aa"); } - @Test - public void testFixAllRelativeHrefs() { - String originHtml = ""; - String replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - - originHtml = ""; - replacedHtml = UrlUtils.fixAllRelativeHrefs(originHtml, "http://www.dianping.com/"); - assertThat(replacedHtml).isEqualTo(""); - } - @Test public void testGetDomain(){ String url = "http://www.dianping.com/aa/"; diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java index 6e350aad..f45f7e2a 100644 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloader.java @@ -5,7 +5,6 @@ import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; - import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; @@ -13,7 +12,6 @@ import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; -import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; @@ -108,8 +106,7 @@ public class SeleniumDownloader implements Downloader, Closeable { String content = webElement.getAttribute("outerHTML"); Page page = new Page(); page.setRawText(content); - page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, - request.getUrl()))); + page.setHtml(new Html(content, request.getUrl())); page.setUrl(new PlainText(request.getUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver);