diff --git a/pom.xml b/pom.xml
index 1497bb0c..cb354e29 100644
--- a/pom.xml
+++ b/pom.xml
@@ -14,6 +14,7 @@
webmagic-samples/
webmagic-selenium/
webmagic-lucene/
+ webmagic-saxon/
diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml
index 843c2c3c..63034f23 100644
--- a/webmagic-extension/pom.xml
+++ b/webmagic-extension/pom.xml
@@ -27,10 +27,6 @@
webmagic-core
${project.version}
-
- net.sf.saxon
- Saxon-HE
-
junit
junit
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
index 445bdd92..158e74d1 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java
@@ -110,11 +110,8 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
- case XPath2:
- selector = new Xpath2Selector(value);
- break;
default:
- selector = new Xpath2Selector(value);
+ selector = new XpathSelector(value);
}
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field);
@@ -140,11 +137,8 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
- case XPath2:
- selector = new Xpath2Selector(value);
- break;
default:
- selector = new Xpath2Selector(value);
+ selector = new XpathSelector(value);
}
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
@@ -165,11 +159,8 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
- case XPath2:
- selector = new Xpath2Selector(value);
- break;
default:
- selector = new Xpath2Selector(value);
+ selector = new XpathSelector(value);
}
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
}
@@ -191,11 +182,8 @@ class PageModelExtractor {
case XPath:
selector = new XpathSelector(value);
break;
- case XPath2:
- selector = new Xpath2Selector(value);
- break;
default:
- selector = new Xpath2Selector(value);
+ selector = new XpathSelector(value);
}
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
Method setterMethod = getSetterMethod(clazz, field);
@@ -228,7 +216,7 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
}
if (!targetUrl.sourceRegion().equals("")) {
- targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion());
+ targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(HelpUrl.class);
@@ -239,13 +227,13 @@ class PageModelExtractor {
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
}
if (!helpUrl.sourceRegion().equals("")) {
- helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion());
+ helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
}
}
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
- extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
+ extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
}
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
index af8946a1..2fcdb82e 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java
@@ -16,9 +16,9 @@ public @interface ExtractBy {
String value();
- public enum Type {XPath2, XPath, Regex, Css}
+ public enum Type {XPath, Regex, Css}
- Type type() default Type.XPath2;
+ Type type() default Type.XPath;
boolean notNull() default true;
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
index f68b7d64..ad720b3a 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java
@@ -6,6 +6,7 @@ import java.lang.annotation.Target;
/**
* 定义类或者字段的抽取规则。
+ *
* @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
@@ -16,8 +17,8 @@ public @interface ExtractBy2 {
String value();
- public enum Type {XPath2, XPath, Regex, Css}
+ public enum Type {XPath, Regex, Css}
- Type type() default Type.XPath2;
+ Type type() default Type.XPath;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
index f3212a6a..023360ef 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java
@@ -16,8 +16,8 @@ public @interface ExtractBy3 {
String value();
- public enum Type {XPath2, XPath, Regex, Css}
+ public enum Type { XPath, Regex, Css}
- Type type() default Type.XPath2;
+ Type type() default Type.XPath;
}
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
index 96927320..1bd3da1e 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java
@@ -16,9 +16,9 @@ public @interface ExtractByRaw {
String value();
- public enum Type {XPath2, XPath, Regex, Css}
+ public enum Type {XPath, Regex, Css}
- Type type() default Type.XPath2;
+ Type type() default Type.XPath;
boolean notNull() default true;
diff --git a/webmagic-saxon/README.md b/webmagic-saxon/README.md
new file mode 100644
index 00000000..0471c68b
--- /dev/null
+++ b/webmagic-saxon/README.md
@@ -0,0 +1,3 @@
+webmagic-saxon
+-------
+webmagic的扩展模块,基于Saxon提供XPath 2.0解析支持。由于Saxon依赖包太大,本模块不作为默认模块引入。
\ No newline at end of file
diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml
new file mode 100644
index 00000000..a2db7685
--- /dev/null
+++ b/webmagic-saxon/pom.xml
@@ -0,0 +1,30 @@
+
+
+
+ us.codecraft
+ webmagic
+ 0.2.0
+
+ 4.0.0
+
+ webmagic-saxon
+
+
+
+ us.codecraft
+ webmagic-core
+ ${project.version}
+
+
+ net.sf.saxon
+ Saxon-HE
+
+
+ junit
+ junit
+
+
+
+
\ No newline at end of file
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
similarity index 100%
rename from webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
rename to webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java
diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
similarity index 99%
rename from webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
rename to webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index 9f32a8f1..b6230406 100644
--- a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,25 +1,8 @@
package us.codecraft.webmagic.selector;
-import net.sf.saxon.Configuration;
-import net.sf.saxon.lib.NamespaceConstant;
-import net.sf.saxon.om.NamespaceResolver;
-import net.sf.saxon.pull.NamespaceContextImpl;
-import net.sf.saxon.xpath.JAXPXPathStaticContext;
-import net.sf.saxon.xpath.XPathEvaluator;
-import net.sf.saxon.xpath.XPathFactoryImpl;
-import org.htmlcleaner.CleanerProperties;
-import org.htmlcleaner.DomSerializer;
-import org.htmlcleaner.HtmlCleaner;
-import org.htmlcleaner.TagNode;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
-import org.w3c.dom.Document;
-import org.w3c.dom.NodeList;
-
-import javax.xml.xpath.*;
-import java.util.Collections;
-import java.util.Iterator;
/**
* @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06