refactor of Selectable (continued) #113

1. remove lazy init of Html
2. rename strings to sourceTexts for better meaning
3. make getSourceTexts abstract and DO NOT always store strings
4. instead store parsed elements of document in HtmlNode
pull/130/head
yihua.huang 11 years ago
parent f9825c214a
commit 41c2ea9498

@ -88,7 +88,7 @@
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.2.3</version>
<version>0.2.4-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>

@ -11,17 +11,7 @@ import java.util.List;
*/
public abstract class AbstractSelectable implements Selectable {
protected List<String> strings;
public AbstractSelectable(String text) {
List<String> results = new ArrayList<String>();
results.add(text);
this.strings = results;
}
public AbstractSelectable(List<String> strings) {
this.strings = strings;
}
protected abstract List<String> getSourceTexts();
@Override
public Selectable css(String selector) {
@ -55,7 +45,7 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public List<String> all() {
return strings;
return getSourceTexts();
}
@Override
@ -74,30 +64,37 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public Selectable select(Selector selector) {
return select(selector, strings);
return select(selector, getSourceTexts());
}
@Override
public Selectable selectList(Selector selector) {
return selectList(selector, strings);
return selectList(selector, getSourceTexts());
}
@Override
public Selectable regex(String regex) {
RegexSelector regexSelector = Selectors.regex(regex);
return selectList(regexSelector, strings);
return selectList(regexSelector, getSourceTexts());
}
@Override
public Selectable regex(String regex, int group) {
RegexSelector regexSelector = Selectors.regex(regex, group);
return selectList(regexSelector, strings);
return selectList(regexSelector, getSourceTexts());
}
@Override
public Selectable replace(String regex, String replacement) {
ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement);
return select(replaceSelector, strings);
return select(replaceSelector, getSourceTexts());
}
/**
 * Returns the first source text of this selectable.
 * <p>
 * {@link #getSourceTexts()} is evaluated exactly once: subclasses are free to
 * build the list on every call (HtmlNode renders each element to a string),
 * so repeated calls would redo that work and could observe different lists.
 *
 * @return the first source text, or {@code null} when the source text list is
 *         {@code null} or empty
 */
public String getFirstSourceText() {
    List<String> sourceTexts = getSourceTexts();
    if (sourceTexts != null && sourceTexts.size() > 0) {
        return sourceTexts.get(0);
    }
    return null;
}
@Override
@ -107,6 +104,6 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public boolean match() {
return strings != null && strings.size() > 0;
return getSourceTexts() != null && getSourceTexts().size() > 0;
}
}

@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
@ -37,16 +36,18 @@ public abstract class BaseElementSelector implements Selector, ElementSelector {
return null;
}
public Elements selectElements(String text) {
public List<Element> selectElements(String text) {
if (text != null) {
return selectElements(Jsoup.parse(text));
} else {
return new Elements();
return new ArrayList<Element>();
}
}
public abstract Element selectElement(Element element);
public abstract Elements selectElements(Element element);
public abstract List<Element> selectElements(Element element);
public abstract boolean hasAttribute();
}

@ -57,7 +57,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public String select(Element element) {
Elements elements = selectElements(element);
List<Element> elements = selectElements(element);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
@ -67,7 +67,7 @@ public class CssSelector extends BaseElementSelector {
@Override
public List<String> selectList(Element doc) {
List<String> strings = new ArrayList<String>();
Elements elements = selectElements(doc);
List<Element> elements = selectElements(doc);
if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) {
String value = getValue(element);
@ -89,7 +89,12 @@ public class CssSelector extends BaseElementSelector {
}
@Override
public Elements selectElements(Element element) {
public List<Element> selectElements(Element element) {
return element.select(selectorText);
}
/**
 * Tells whether this selector has an attribute name set.
 *
 * @return {@code true} when {@code attrName} is non-null
 */
@Override
public boolean hasAttribute() {
    final boolean attributeNamePresent = (attrName != null);
    return attributeNamePresent;
}
}

@ -2,10 +2,11 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
@ -14,7 +15,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br>
* @since 0.1.0
*/
public class Html extends PlainText {
public class Html extends HtmlNode {
private Logger logger = LoggerFactory.getLogger(getClass());
@ -23,130 +24,26 @@ public class Html extends PlainText {
*/
private Document document;
private boolean needInitCache = true;
public Html(List<String> strings) {
super(strings);
}
public Html(String text) {
super(text);
}
public Html(List<String> strings, boolean needInitCache) {
super(strings);
this.needInitCache = needInitCache;
}
public Html(String text, boolean needInitCache) {
super(text);
this.needInitCache = needInitCache;
}
/**
* lazy init
*/
private void initDocument() {
if (this.document == null && needInitCache) {
needInitCache = false;
//just init once whether the parsing succeeds or not
try {
this.document = Jsoup.parse(getText());
} catch (Exception e) {
logger.warn("parse document error ", e);
}
try {
this.document = Jsoup.parse(text);
} catch (Exception e) {
this.document = null;
logger.warn("parse document error ", e);
}
}
public Html(Document document) {
super(document.html());
this.document = document;
}
public static Html create(String text) {
return new Html(text);
}
@Override
protected Selectable select(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
String result = selector.select(string);
if (result != null) {
results.add(result);
}
}
return new Html(results, false);
}
@Override
protected Selectable selectList(Selector selector, List<String> strings) {
initDocument();
List<String> results = new ArrayList<String>();
for (String string : strings) {
List<String> result = selector.selectList(string);
results.addAll(result);
}
return new Html(results, false);
}
@Override
public Selectable smartContent() {
initDocument();
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, strings);
}
@Override
public Selectable links() {
return xpath("//a/@href");
}
@Override
public Selectable xpath(String xpath) {
XpathSelector xpathSelector = Selectors.xpath(xpath);
if (document != null) {
return new Html(xpathSelector.selectList(document), false);
}
return selectList(xpathSelector, strings);
}
@Override
public Selectable $(String selector) {
CssSelector cssSelector = Selectors.$(selector);
if (document != null) {
return new Html(cssSelector.selectList(document), false);
}
return selectList(cssSelector, strings);
}
@Override
public Selectable $(String selector, String attrName) {
CssSelector cssSelector = Selectors.$(selector, attrName);
if (document != null) {
return new Html(cssSelector.selectList(document), false);
}
return selectList(cssSelector, strings);
}
public Document getDocument() {
initDocument();
return document;
}
public String getText() {
if (strings != null && strings.size() > 0) {
return strings.get(0);
}
return document.html();
}
@Override
public List<Selectable> nodes() {
ArrayList<Selectable> selectables = new ArrayList<Selectable>();
selectables.add(this);
return selectables;
protected List<Element> getElements() {
return Collections.<Element>singletonList(getDocument());
}
/**
@ -158,7 +55,7 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.select(getDocument());
} else {
return selector.select(getText());
return selector.select(getFirstSourceText());
}
}
@ -167,7 +64,12 @@ public class Html extends PlainText {
ElementSelector elementSelector = (ElementSelector) selector;
return elementSelector.selectList(getDocument());
} else {
return selector.selectList(getText());
return selector.selectList(getFirstSourceText());
}
}
/**
 * Static factory equivalent to calling the {@code Html(String)} constructor.
 *
 * @param text the text handed to the constructor
 * @return a new Html instance built from {@code text}
 */
public static Html create(String text) {
    final Html created = new Html(text);
    return created;
}
}

@ -1,7 +0,0 @@
package us.codecraft.webmagic.selector;
/**
* @author code4crafer@gmail.com
*/
public class HtmlFragment {
}

@ -0,0 +1,97 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
/**
 * Selectable backed by a list of parsed jsoup elements instead of raw strings.
 * <p>
 * Element selections without an attribute yield another HtmlNode holding the
 * matched elements; selections with an attribute yield a PlainText holding the
 * extracted strings. Source texts are rendered from the elements on demand by
 * {@link #getSourceTexts()} and are not stored.
 *
 * @author code4crafer@gmail.com
 */
public class HtmlNode extends AbstractSelectable {

    /** Elements wrapped by this node; {@code null} when created through the no-arg constructor. */
    private final List<Element> elements;

    /**
     * Wraps the given elements.
     *
     * @param elements elements this node selects from
     */
    public HtmlNode(List<Element> elements) {
        this.elements = elements;
    }

    /**
     * Creates a node without stored elements. Intended for subclasses that
     * supply their elements by overriding {@link #getElements()}.
     */
    public HtmlNode() {
        elements = null;
    }

    /**
     * Returns the elements this node operates on. Subclasses may override this
     * to provide elements from another source.
     *
     * @return the wrapped elements, possibly {@code null}
     */
    protected List<Element> getElements() {
        return elements;
    }

    /**
     * Null-safe view of {@link #getElements()}, so that a node without
     * elements behaves as an empty node instead of throwing.
     */
    private List<Element> elementsOrEmpty() {
        List<Element> current = getElements();
        return current != null ? current : new ArrayList<Element>();
    }

    /**
     * Applies the smart content selector to the source texts of this node.
     */
    @Override
    public Selectable smartContent() {
        SmartContentSelector smartContentSelector = Selectors.smartContent();
        return select(smartContentSelector, getSourceTexts());
    }

    /**
     * Selects the {@code href} attribute of every anchor via the XPath {@code //a/@href}.
     */
    @Override
    public Selectable links() {
        return xpath("//a/@href");
    }

    /**
     * Evaluates the XPath expression against every element of this node.
     */
    @Override
    public Selectable xpath(String xpath) {
        XpathSelector xpathSelector = Selectors.xpath(xpath);
        return selectElements(xpathSelector);
    }

    /**
     * Runs an element selector over every element of this node.
     *
     * @param elementSelector selector to apply to each element
     * @return an HtmlNode of all matched elements when the selector has no
     *         attribute; otherwise a PlainText of all selected strings
     */
    protected Selectable selectElements(BaseElementSelector elementSelector) {
        List<Element> current = elementsOrEmpty();
        if (!elementSelector.hasAttribute()) {
            List<Element> resultElements = new ArrayList<Element>();
            for (Element element : current) {
                List<Element> selectElements = elementSelector.selectElements(element);
                resultElements.addAll(selectElements);
            }
            return new HtmlNode(resultElements);
        } else {
            // has attribute, consider as plaintext
            List<String> resultStrings = new ArrayList<String>();
            for (Element element : current) {
                List<String> selectList = elementSelector.selectList(element);
                resultStrings.addAll(selectList);
            }
            return new PlainText(resultStrings);
        }
    }

    /**
     * Evaluates the CSS selector against every element of this node.
     */
    @Override
    public Selectable $(String selector) {
        CssSelector cssSelector = Selectors.$(selector);
        return selectElements(cssSelector);
    }

    /**
     * Evaluates the CSS selector with an attribute name against every element of this node.
     */
    @Override
    public Selectable $(String selector, String attrName) {
        CssSelector cssSelector = Selectors.$(selector, attrName);
        return selectElements(cssSelector);
    }

    /**
     * Splits this node into one node per element, mirroring PlainText.nodes(),
     * which yields one node per source text.
     *
     * @return one HtmlNode for each wrapped element, in order; empty when
     *         there are no elements
     */
    @Override
    public List<Selectable> nodes() {
        List<Element> current = elementsOrEmpty();
        List<Selectable> selectables = new ArrayList<Selectable>(current.size());
        for (Element element : current) {
            List<Element> single = new ArrayList<Element>(1);
            single.add(element);
            selectables.add(new HtmlNode(single));
        }
        return selectables;
    }

    /**
     * Renders every element to its string form. A new list is built on each call.
     *
     * @return the string form of each element, in order; empty when there are
     *         no elements
     */
    @Override
    protected List<String> getSourceTexts() {
        List<Element> current = elementsOrEmpty();
        List<String> sourceTexts = new ArrayList<String>(current.size());
        for (Element element : current) {
            sourceTexts.add(element.toString());
        }
        return sourceTexts;
    }
}

@ -26,7 +26,7 @@ public class Json extends PlainText {
* @return
*/
public Json removePadding(String padding) {
String text = getText();
String text = getFirstSourceText();
XTokenQueue tokenQueue = new XTokenQueue(text);
tokenQueue.consumeWhitespace();
tokenQueue.consume(padding);
@ -36,29 +36,22 @@ public class Json extends PlainText {
}
public <T> T toObject(Class<T> clazz) {
if (getText() == null) {
if (getFirstSourceText() == null) {
return null;
}
return JSON.parseObject(getText(), clazz);
return JSON.parseObject(getFirstSourceText(), clazz);
}
public <T> List<T> toList(Class<T> clazz) {
if (getText() == null) {
if (getFirstSourceText() == null) {
return null;
}
return JSON.parseArray(getText(), clazz);
}
public String getText() {
if (strings != null && strings.size() > 0) {
return strings.get(0);
}
return null;
return JSON.parseArray(getFirstSourceText(), clazz);
}
@Override
public Selectable jsonPath(String jsonPath) {
JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath);
return selectList(jsonPathSelector,strings);
return selectList(jsonPathSelector,getSourceTexts());
}
}

@ -12,12 +12,15 @@ import java.util.List;
*/
public class PlainText extends AbstractSelectable {
public PlainText(List<String> strings) {
super(strings);
protected List<String> sourceTexts;
public PlainText(List<String> sourceTexts) {
this.sourceTexts = sourceTexts;
}
public PlainText(String text) {
super(text);
this.sourceTexts = new ArrayList<String>();
sourceTexts.add(text);
}
public static PlainText create(String text) {
@ -51,11 +54,15 @@ public class PlainText extends AbstractSelectable {
@Override
public List<Selectable> nodes() {
List<Selectable> nodes = new ArrayList<Selectable>(strings.size());
for (String string : strings) {
List<Selectable> nodes = new ArrayList<Selectable>(getSourceTexts().size());
for (String string : getSourceTexts()) {
nodes.add(PlainText.create(string));
}
return nodes;
}
/**
 * Returns the stored source texts as-is (the same list instance held in
 * the {@code sourceTexts} field, not a copy).
 */
@Override
protected List<String> getSourceTexts() {
    return this.sourceTexts;
}
}

@ -2,7 +2,6 @@ package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
@ -34,7 +33,7 @@ public class XpathSelector extends BaseElementSelector {
@Override
public Element selectElement(Element element) {
Elements elements = selectElements(element);
List<Element> elements = selectElements(element);
if (CollectionUtils.isNotEmpty(elements)){
return elements.get(0);
}
@ -42,7 +41,12 @@ public class XpathSelector extends BaseElementSelector {
}
@Override
public Elements selectElements(Element element) {
public List<Element> selectElements(Element element) {
return xPathEvaluator.evaluate(element).getElements();
}
/**
 * Tells whether the underlying XPath evaluator reports an attribute.
 *
 * @return the result of {@code xPathEvaluator.hasAttribute()}
 */
@Override
public boolean hasAttribute() {
    final boolean evaluatorHasAttribute = xPathEvaluator.hasAttribute();
    return evaluatorHasAttribute;
}
}

@ -39,7 +39,7 @@ public class HttpClientDownloaderTest {
public void testDownloader() {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
Html html = httpClientDownloader.download("https://github.com");
assertTrue(!html.getText().isEmpty());
assertTrue(!html.getFirstSourceText().isEmpty());
}
@Test(expected = IllegalArgumentException.class)

Loading…
Cancel
Save