diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index b19820df..ef9f84aa 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -25,6 +25,12 @@
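             <artifactId>commons-lang3</artifactId>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>xsoup</artifactId>
+            <version>0.0.1-SNAPSHOT</version>
+        </dependency>
+
             <groupId>log4j</groupId>
             <artifactId>log4j</artifactId>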
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
new file mode 100644
index 00000000..d14a708a
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java
@@ -0,0 +1,52 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.Jsoup;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Skeleton for selectors that operate on jsoup elements.
+ * <p>
+ * Implements the text-based selection methods by parsing the text with
+ * {@link Jsoup#parse(String)} and delegating to the element-based methods of
+ * {@link ElementSelector}, so subclasses only implement the element-based ones.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public abstract class BaseElementSelector implements Selector, ElementSelector {
+
+    /**
+     * Parses {@code text} as HTML and selects a single result from the parsed document.
+     *
+     * @param text HTML source; may be {@code null}
+     * @return the result of {@link #select(org.jsoup.nodes.Element)} on the parsed document,
+     *         or {@code null} when {@code text} is {@code null}
+     */
+    @Override
+    public String select(String text) {
+        if (text == null) {
+            // Nothing to parse; treat as "no match" instead of handing null to the parser.
+            return null;
+        }
+        return select(Jsoup.parse(text));
+    }
+
+    /**
+     * Parses {@code text} as HTML and selects all results from the parsed document.
+     *
+     * @param text HTML source; may be {@code null}
+     * @return the result of {@link #selectList(org.jsoup.nodes.Element)} on the parsed document,
+     *         or a new empty list when {@code text} is {@code null}
+     */
+    @Override
+    public List<String> selectList(String text) {
+        if (text == null) {
+            // Nothing to parse; an empty list means "no results".
+            return new ArrayList<String>();
+        }
+        return selectList(Jsoup.parse(text));
+    }
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
index 5031077c..9c7032c0 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java
@@ -1,8 +1,6 @@
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
@@ -15,7 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com
* @since 0.1.0
*/
-public class CssSelector implements Selector {
+public class CssSelector extends BaseElementSelector {
private String selectorText;
@@ -30,16 +28,6 @@ public class CssSelector implements Selector {
this.attrName = attrName;
}
- @Override
- public String select(String text) {
- Document doc = Jsoup.parse(text);
- Elements elements = doc.select(selectorText);
- if (CollectionUtils.isEmpty(elements)) {
- return null;
- }
- return getValue(elements.get(0));
- }
-
private String getValue(Element element) {
if (attrName == null) {
return element.outerHtml();
@@ -51,9 +39,17 @@ public class CssSelector implements Selector {
}
@Override
- public List selectList(String text) {
+ public String select(Element element) {
+ Elements elements = element.select(selectorText);
+ if (CollectionUtils.isEmpty(elements)) {
+ return null;
+ }
+ return getValue(elements.get(0));
+ }
+
+ @Override
+ public List selectList(Element doc) {
List strings = new ArrayList();
- Document doc = Jsoup.parse(text);
Elements elements = doc.select(selectorText);
if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
new file mode 100644
index 00000000..793b8256
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java
@@ -0,0 +1,32 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+
+import java.util.List;
+
+/**
+ * Selector (extractor) that works directly on html elements.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public interface ElementSelector {
+
+    /**
+     * Extract a single result from the element.
+     * If there is more than one result, only the first will be chosen.
+     *
+     * @param element the element to select from
+     * @return result
+     */
+    public String select(Element element);
+
+    /**
+     * Extract all results from the element.
+     *
+     * @param element the element to select from
+     * @return results
+     */
+    public List<String> selectList(Element element);
+
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
new file mode 100644
index 00000000..698b29bd
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XsoupSelector.java
@@ -0,0 +1,53 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Element;
+import us.codecraft.xsoup.XPathEvaluator;
+import us.codecraft.xsoup.Xsoup;
+
+import java.util.List;
+
+/**
+ * XPath selector based on Xsoup.
+ * <p>
+ * The XPath expression is compiled once in the constructor and the resulting
+ * evaluator is reused for every selection.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.2.2
+ */
+public class XsoupSelector extends BaseElementSelector {
+
+    /** Compiled form of the XPath expression; assigned once in the constructor. */
+    private final XPathEvaluator xPathEvaluator;
+
+    /**
+     * Creates a selector for the given XPath expression.
+     *
+     * @param xpathStr XPath expression, compiled via {@link Xsoup#compile(String)}
+     */
+    public XsoupSelector(String xpathStr) {
+        this.xPathEvaluator = Xsoup.compile(xpathStr);
+    }
+
+    /**
+     * Evaluates the compiled XPath against {@code element} and returns a single result.
+     *
+     * @param element the element to evaluate against
+     * @return the value produced by {@code get()} on the evaluation result
+     */
+    @Override
+    public String select(Element element) {
+        return xPathEvaluator.evaluate(element).get();
+    }
+
+    /**
+     * Evaluates the compiled XPath against {@code element} and returns every result.
+     *
+     * @param element the element to evaluate against
+     * @return the values produced by {@code list()} on the evaluation result
+     */
+    @Override
+    public List<String> selectList(Element element) {
+        return xPathEvaluator.evaluate(element).list();
+    }
+}