From 41c2ea94984ed19120d9ef0abe2ef1d0b93135fd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 27 May 2014 17:34:19 +0800 Subject: [PATCH] refactor of selectable cont' #113 1. remove lazy init of Html 2. rename strings to sourceTexts for better meaning 3. make getSourceTexts abstract and DO NOT always store strings 4. instead store parsed elements of document in HtmlNode --- pom.xml | 2 +- .../webmagic/selector/AbstractSelectable.java | 33 ++--- .../selector/BaseElementSelector.java | 9 +- .../webmagic/selector/CssSelector.java | 11 +- .../us/codecraft/webmagic/selector/Html.java | 132 +++--------------- .../webmagic/selector/HtmlFragment.java | 7 - .../codecraft/webmagic/selector/HtmlNode.java | 97 +++++++++++++ .../us/codecraft/webmagic/selector/Json.java | 19 +-- .../webmagic/selector/PlainText.java | 17 ++- .../webmagic/selector/XpathSelector.java | 10 +- .../downloader/HttpClientDownloaderTest.java | 2 +- 11 files changed, 169 insertions(+), 170 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java diff --git a/pom.xml b/pom.xml index de5cf91a..2309a15d 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ us.codecraft xsoup - 0.2.3 + 0.2.4-SNAPSHOT com.alibaba diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java index 2ac4c703..e2bb5521 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/AbstractSelectable.java @@ -11,17 +11,7 @@ import java.util.List; */ public abstract class AbstractSelectable implements Selectable { - protected List strings; - - public AbstractSelectable(String text) { - List results = new ArrayList(); - results.add(text); - this.strings = results; - } - - public AbstractSelectable(List strings) { - this.strings = strings; - } + protected abstract List getSourceTexts(); @Override public Selectable css(String selector) { @@ -55,7 +45,7 @@ public abstract class AbstractSelectable implements Selectable { @Override public List all() { - return strings; + return getSourceTexts(); } @Override @@ -74,30 +64,37 @@ public abstract class AbstractSelectable implements Selectable { @Override public Selectable select(Selector selector) { - return select(selector, strings); + return select(selector, getSourceTexts()); } @Override public Selectable selectList(Selector selector) { - return selectList(selector, strings); + return selectList(selector, getSourceTexts()); } @Override public Selectable regex(String regex) { RegexSelector regexSelector = Selectors.regex(regex); - return selectList(regexSelector, strings); + return selectList(regexSelector, getSourceTexts()); } @Override public Selectable regex(String regex, int group) { RegexSelector regexSelector = Selectors.regex(regex, group); - return selectList(regexSelector, strings); + return selectList(regexSelector, getSourceTexts()); } @Override public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); - return select(replaceSelector, strings); + return select(replaceSelector, getSourceTexts()); + } + + public String getFirstSourceText() { + if (getSourceTexts() != null && getSourceTexts().size() > 0) { + return getSourceTexts().get(0); + } + return null; } @Override @@ -107,6 +104,6 @@ public abstract class AbstractSelectable implements Selectable { @Override public boolean match() { - return strings != null && strings.size() > 0; + return getSourceTexts() != null && getSourceTexts().size() > 0; } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java index 3b9b22d3..bbc7217a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import java.util.ArrayList; import java.util.List; @@ -37,16 +36,18 @@ public abstract class BaseElementSelector implements Selector, ElementSelector { return null; } - public Elements selectElements(String text) { + public List selectElements(String text) { if (text != null) { return selectElements(Jsoup.parse(text)); } else { - return new Elements(); + return new ArrayList(); } } public abstract Element selectElement(Element element); - public abstract Elements selectElements(Element element); + public abstract List selectElements(Element element); + + public abstract boolean hasAttribute(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 095af35a..6a638dbf 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector { @Override public String select(Element element) { - Elements elements = selectElements(element); + List elements = selectElements(element); if (CollectionUtils.isEmpty(elements)) { return null; } @@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector { @Override public List selectList(Element doc) { List strings = new ArrayList(); - Elements elements = selectElements(doc); + List elements = selectElements(doc); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { String value = getValue(element); @@ -89,7 +89,12 @@ public class CssSelector extends BaseElementSelector { } @Override - public Elements selectElements(Element element) { + public List selectElements(Element element) { return element.select(selectorText); } + + @Override + public boolean hasAttribute() { + return attrName != null; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 97485773..7b593ed1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; +import java.util.Collections; import java.util.List; /** @@ -14,7 +15,7 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.1.0 */ -public class Html extends PlainText { +public class Html extends HtmlNode { private Logger logger = LoggerFactory.getLogger(getClass()); @@ -23,130 +24,26 @@ public class Html extends PlainText { */ private Document document; - private boolean needInitCache = true; - - public Html(List strings) { - super(strings); - } - public Html(String text) { - super(text); - } - - public Html(List strings, boolean needInitCache) { - super(strings); - this.needInitCache = needInitCache; - } - - public Html(String text, boolean needInitCache) { - super(text); - this.needInitCache = needInitCache; - } - - /** - * lazy init - */ - private void initDocument() { - if (this.document == null && needInitCache) { - needInitCache = false; - //just init once whether the parsing succeeds or not - try { - this.document = Jsoup.parse(getText()); - } catch (Exception e) { - logger.warn("parse document error ", e); - } + try { + this.document = Jsoup.parse(text); + } catch (Exception e) { + this.document = null; + logger.warn("parse document error ", e); } } public Html(Document document) { - super(document.html()); this.document = document; } - public static Html create(String text) { - return new Html(text); - } - - @Override - protected Selectable select(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - String result = selector.select(string); - if (result != null) { - results.add(result); - } - } - return new Html(results, false); - } - - @Override - protected Selectable selectList(Selector selector, List strings) { - initDocument(); - List results = new ArrayList(); - for (String string : strings) { - List result = selector.selectList(string); - results.addAll(result); - } - return new Html(results, false); - } - - @Override - public Selectable smartContent() { - initDocument(); - SmartContentSelector smartContentSelector = Selectors.smartContent(); - return select(smartContentSelector, strings); - } - - @Override - public Selectable links() { - return xpath("//a/@href"); - } - - @Override - public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); - if (document != null) { - return new Html(xpathSelector.selectList(document), false); - } - return selectList(xpathSelector, strings); - } - - @Override - public Selectable $(String selector) { - CssSelector cssSelector = Selectors.$(selector); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - - @Override - public Selectable $(String selector, String attrName) { - CssSelector cssSelector = Selectors.$(selector, attrName); - if (document != null) { - return new Html(cssSelector.selectList(document), false); - } - return selectList(cssSelector, strings); - } - public Document getDocument() { - initDocument(); return document; } - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return document.html(); - } - @Override - public List nodes() { - ArrayList selectables = new ArrayList(); - selectables.add(this); - return selectables; + protected List getElements() { + return Collections.singletonList(getDocument()); } /** @@ -158,7 +55,7 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.select(getDocument()); } else { - return selector.select(getText()); + return selector.select(getFirstSourceText()); } } @@ -167,7 +64,12 @@ public class Html extends PlainText { ElementSelector elementSelector = (ElementSelector) selector; return elementSelector.selectList(getDocument()); } else { - return selector.selectList(getText()); + return selector.selectList(getFirstSourceText()); } } + + public static Html create(String text) { + return new Html(text); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java deleted file mode 100644 index d427f679..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlFragment.java +++ /dev/null @@ -1,7 +0,0 @@ -package us.codecraft.webmagic.selector; - -/** - * @author code4crafer@gmail.com - */ -public class HtmlFragment { -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java new file mode 100644 index 00000000..3ca7e5c4 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -0,0 +1,97 @@ +package us.codecraft.webmagic.selector; + +import org.jsoup.nodes.Element; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafer@gmail.com + */ +public class HtmlNode extends AbstractSelectable { + + private final List elements; + + public HtmlNode(List elements) { + this.elements = elements; + } + + public HtmlNode() { + elements = null; + } + + protected List getElements() { + return elements; + } + + @Override + public Selectable smartContent() { + SmartContentSelector smartContentSelector = Selectors.smartContent(); + return select(smartContentSelector, getSourceTexts()); + } + + @Override + public Selectable links() { + return xpath("//a/@href"); + } + + @Override + public Selectable xpath(String xpath) { + XpathSelector xpathSelector = Selectors.xpath(xpath); + return selectElements(xpathSelector); + } + + /** + * select elements + * + * @param elementSelector + * @return + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + for (Element element : getElements()) { + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + for (Element element : getElements()) { + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + @Override + public Selectable $(String selector) { + CssSelector cssSelector = Selectors.$(selector); + return selectElements(cssSelector); + } + + @Override + public Selectable $(String selector, String attrName) { + CssSelector cssSelector = Selectors.$(selector, attrName); + return selectElements(cssSelector); + } + + @Override + public List nodes() { + ArrayList selectables = new ArrayList(); + selectables.add(this); + return selectables; + } + + @Override + protected List getSourceTexts() { + List sourceTexts = new ArrayList(getElements().size()); + for (Element element : getElements()) { + sourceTexts.add(element.toString()); + } + return sourceTexts; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java index 96d1c2b6..4c31eb41 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Json.java @@ -26,7 +26,7 @@ public class Json extends PlainText { * @return */ public Json removePadding(String padding) { - String text = getText(); + String text = getFirstSourceText(); XTokenQueue tokenQueue = new XTokenQueue(text); tokenQueue.consumeWhitespace(); tokenQueue.consume(padding); @@ -36,29 +36,22 @@ public class Json extends PlainText { } public T toObject(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseObject(getText(), clazz); + return JSON.parseObject(getFirstSourceText(), clazz); } public List toList(Class clazz) { - if (getText() == null) { + if (getFirstSourceText() == null) { return null; } - return JSON.parseArray(getText(), clazz); - } - - public String getText() { - if (strings != null && strings.size() > 0) { - return strings.get(0); - } - return null; + return JSON.parseArray(getFirstSourceText(), clazz); } @Override public Selectable jsonPath(String jsonPath) { JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); - return selectList(jsonPathSelector,strings); + return selectList(jsonPathSelector,getSourceTexts()); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index c1d034a2..557763b1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -12,12 +12,15 @@ import java.util.List; */ public class PlainText extends AbstractSelectable { - public PlainText(List strings) { - super(strings); + protected List sourceTexts; + + public PlainText(List sourceTexts) { + this.sourceTexts = sourceTexts; } public PlainText(String text) { - super(text); + this.sourceTexts = new ArrayList(); + sourceTexts.add(text); } public static PlainText create(String text) { @@ -51,11 +54,15 @@ public class PlainText extends AbstractSelectable { @Override public List nodes() { - List nodes = new ArrayList(strings.size()); - for (String string : strings) { + List nodes = new ArrayList(getSourceTexts().size()); + for (String string : getSourceTexts()) { nodes.add(PlainText.create(string)); } return nodes; } + @Override + protected List getSourceTexts() { + return sourceTexts; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 4516a3de..8a980a50 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector; import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; @@ -34,7 +33,7 @@ public class XpathSelector extends BaseElementSelector { @Override public Element selectElement(Element element) { - Elements elements = selectElements(element); + List elements = selectElements(element); if (CollectionUtils.isNotEmpty(elements)){ return elements.get(0); } @@ -42,7 +41,12 @@ public class XpathSelector extends BaseElementSelector { } @Override - public Elements selectElements(Element element) { + public List selectElements(Element element) { return xPathEvaluator.evaluate(element).getElements(); } + + @Override + public boolean hasAttribute() { + return xPathEvaluator.hasAttribute(); + } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java index 084a1100..352e49cb 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java @@ -39,7 +39,7 @@ public class HttpClientDownloaderTest { public void testDownloader() { HttpClientDownloader httpClientDownloader = new HttpClientDownloader(); Html html = httpClientDownloader.download("https://github.com"); - assertTrue(!html.getText().isEmpty()); + assertTrue(!html.getFirstSourceText().isEmpty()); } @Test(expected = IllegalArgumentException.class)