NOTE(review): reconstructed from a copy of this patch in which line breaks were
collapsed and angle-bracket markup appears to have been stripped. The Maven XML
tags and the <String> type arguments below were restored by inference, and
indentation / blank context lines were rebuilt from the hunk line counts.
Confirm the context lines against the base files before applying (a trailing
<br> may also be missing from some Javadoc lines). A few added lines were
lightly edited (Javadoc wording, a final field, spacing), so the post-image
blob ids on the "index" lines no longer match the resulting content.

diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index b19820df..ef9f84aa 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -25,6 +25,12 @@
             <artifactId>commons-lang3</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>xsoup</artifactId>
+            <version>0.0.1-SNAPSHOT</version>
+        </dependency>
+
         <dependency>
             <groupId>log4j</groupId>
             <artifactId>log4j</artifactId>
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
new file mode 100644
index 00000000..d14a708a
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
@@ -0,0 +1,23 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.Jsoup;
+
+import java.util.List;
+
+/**
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public abstract class BaseElementSelector implements Selector, ElementSelector {
+
+    @Override
+    public String select(String text) {
+        return select(Jsoup.parse(text));
+    }
+
+    @Override
+    public List<String> selectList(String text) {
+        return selectList(Jsoup.parse(text));
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
index 5031077c..9c7032c0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
@@ -1,8 +1,6 @@
 package us.codecraft.webmagic.selector;
 
 import org.apache.commons.collections.CollectionUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 
@@ -15,7 +13,7 @@ import java.util.List;
  * @author code4crafter@gmail.com
  * @since 0.1.0
  */
-public class CssSelector implements Selector {
+public class CssSelector extends BaseElementSelector {
 
     private String selectorText;
 
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
         this.attrName = attrName;
     }
 
-    @Override
-    public String select(String text) {
-        Document doc = Jsoup.parse(text);
-        Elements elements = doc.select(selectorText);
-        if (CollectionUtils.isEmpty(elements)) {
-            return null;
-        }
-        return getValue(elements.get(0));
-    }
-
     private String getValue(Element element) {
         if (attrName == null) {
             return element.outerHtml();
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
     }
 
     @Override
-    public List<String> selectList(String text) {
+    public String select(Element element) {
+        Elements elements = element.select(selectorText);
+        if (CollectionUtils.isEmpty(elements)) {
+            return null;
+        }
+        return getValue(elements.get(0));
+    }
+
+    @Override
+    public List<String> selectList(Element doc) {
         List<String> strings = new ArrayList<String>();
-        Document doc = Jsoup.parse(text);
         Elements elements = doc.select(selectorText);
         if (CollectionUtils.isNotEmpty(elements)) {
             for (Element element : elements) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
new file mode 100644
index 00000000..793b8256
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+
+import java.util.List;
+
+/**
+ * Selector (extractor) for HTML elements.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public interface ElementSelector {
+
+    /**
+     * Extract a single result from the given element.
+     * If there is more than one result, only the first is chosen.
+     *
+     * @param element the element to extract from
+     * @return result
+     */
+    public String select(Element element);
+
+    /**
+     * Extract all results from the given element.
+     *
+     * @param element the element to extract from
+     * @return results
+     */
+    public List<String> selectList(Element element);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
new file mode 100644
index 00000000..698b29bd
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+import us.codecraft.xsoup.XPathEvaluator;
+import us.codecraft.xsoup.Xsoup;
+
+import java.util.List;
+
+/**
+ * XPath selector based on Xsoup.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public class XsoupSelector extends BaseElementSelector {
+
+    private final XPathEvaluator xPathEvaluator;
+
+    public XsoupSelector(String xpathStr) {
+        this.xPathEvaluator = Xsoup.compile(xpathStr);
+    }
+
+    @Override
+    public String select(Element element) {
+        return xPathEvaluator.evaluate(element).get();
+    }
+
+    @Override
+    public List<String> selectList(Element element) {
+        return xPathEvaluator.evaluate(element).list();
+    }
+}