diff --git a/pom.xml b/pom.xml index 1497bb0c..cb354e29 100644 --- a/pom.xml +++ b/pom.xml @@ -14,6 +14,7 @@ webmagic-samples/ webmagic-selenium/ webmagic-lucene/ + webmagic-saxon/ diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 843c2c3c..63034f23 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -27,10 +27,6 @@ webmagic-core ${project.version} - - net.sf.saxon - Saxon-HE - junit junit diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 445bdd92..158e74d1 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -110,11 +110,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); Method setterMethod = getSetterMethod(clazz, field); @@ -140,11 +137,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } @@ -165,11 +159,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); } @@ -191,11 +182,8 @@ class PageModelExtractor { case XPath: selector = new XpathSelector(value); break; - case XPath2: - selector = new Xpath2Selector(value); - break; default: - selector = new Xpath2Selector(value); + selector = new XpathSelector(value); } fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); Method setterMethod = getSetterMethod(clazz, field); @@ -228,7 +216,7 @@ class PageModelExtractor { targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } if (!targetUrl.sourceRegion().equals("")) { - targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); + targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion()); } } annotation = clazz.getAnnotation(HelpUrl.class); @@ -239,13 +227,13 @@ class PageModelExtractor { helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); } if (!helpUrl.sourceRegion().equals("")) { - helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); + helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion()); } } annotation = clazz.getAnnotation(ExtractBy.class); if (annotation != null) { ExtractBy extractBy = (ExtractBy) annotation; - extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); + extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java index af8946a1..2fcdb82e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy.java @@ -16,9 +16,9 @@ public @interface ExtractBy { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; boolean notNull() default true; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java index f68b7d64..ad720b3a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy2.java @@ -6,6 +6,7 @@ import java.lang.annotation.Target; /** * 定义类或者字段的抽取规则。
+ * * @author code4crafter@gmail.com
* @date: 13-8-1
* Time: 下午8:40
@@ -16,8 +17,8 @@ public @interface ExtractBy2 { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java index f3212a6a..023360ef 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractBy3.java @@ -16,8 +16,8 @@ public @interface ExtractBy3 { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type { XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java index 96927320..1bd3da1e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByRaw.java @@ -16,9 +16,9 @@ public @interface ExtractByRaw { String value(); - public enum Type {XPath2, XPath, Regex, Css} + public enum Type {XPath, Regex, Css} - Type type() default Type.XPath2; + Type type() default Type.XPath; boolean notNull() default true; diff --git a/webmagic-saxon/README.md b/webmagic-saxon/README.md new file mode 100644 index 00000000..0471c68b --- /dev/null +++ b/webmagic-saxon/README.md @@ -0,0 +1,3 @@ +webmagic-extension +------- +webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 \ No newline at end of file diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml new file mode 100644 index 00000000..a2db7685 --- /dev/null +++ b/webmagic-saxon/pom.xml @@ -0,0 +1,30 @@ + + + + us.codecraft + webmagic + 0.2.0 + + 4.0.0 + + webmagic-saxon + + + + us.codecraft + webmagic-core + ${project.version} + + + net.sf.saxon + Saxon-HE + + + junit + junit + + + + \ No newline at end of file diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java similarity index 100% rename from webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java rename to webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 99% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java rename to webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 9f32a8f1..b6230406 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,25 +1,8 @@ package us.codecraft.webmagic.selector; -import net.sf.saxon.Configuration; -import net.sf.saxon.lib.NamespaceConstant; -import net.sf.saxon.om.NamespaceResolver; -import net.sf.saxon.pull.NamespaceContextImpl; -import net.sf.saxon.xpath.JAXPXPathStaticContext; -import net.sf.saxon.xpath.XPathEvaluator; -import net.sf.saxon.xpath.XPathFactoryImpl; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; - -import javax.xml.xpath.*; -import java.util.Collections; -import java.util.Iterator; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06