move xpath2.0 support to a separate package

pull/17/head
yihua.huang 12 years ago
parent 268bd8d0c4
commit 521fbad987

@ -14,6 +14,7 @@
<module>webmagic-samples/</module> <module>webmagic-samples/</module>
<module>webmagic-selenium/</module> <module>webmagic-selenium/</module>
<module>webmagic-lucene/</module> <module>webmagic-lucene/</module>
<module>webmagic-saxon/</module>
</modules> </modules>
<dependencyManagement> <dependencyManagement>

@ -27,10 +27,6 @@
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency> <dependency>
<groupId>junit</groupId> <groupId>junit</groupId>
<artifactId>junit</artifactId> <artifactId>junit</artifactId>

@ -110,11 +110,8 @@ class PageModelExtractor {
case XPath: case XPath:
selector = new XpathSelector(value); selector = new XpathSelector(value);
break; break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default: default:
selector = new Xpath2Selector(value); selector = new XpathSelector(value);
} }
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi()); fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.Html, extractBy.notNull(), extractBy.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
@ -140,11 +137,8 @@ class PageModelExtractor {
case XPath: case XPath:
selector = new XpathSelector(value); selector = new XpathSelector(value);
break; break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default: default:
selector = new Xpath2Selector(value); selector = new XpathSelector(value);
} }
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
} }
@ -165,11 +159,8 @@ class PageModelExtractor {
case XPath: case XPath:
selector = new XpathSelector(value); selector = new XpathSelector(value);
break; break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default: default:
selector = new Xpath2Selector(value); selector = new XpathSelector(value);
} }
fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector)); fieldExtractor.setSelector(new AndSelector(fieldExtractor.getSelector(), selector));
} }
@ -191,11 +182,8 @@ class PageModelExtractor {
case XPath: case XPath:
selector = new XpathSelector(value); selector = new XpathSelector(value);
break; break;
case XPath2:
selector = new Xpath2Selector(value);
break;
default: default:
selector = new Xpath2Selector(value); selector = new XpathSelector(value);
} }
fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi()); fieldExtractor = new FieldExtractor(field, selector, FieldExtractor.Source.RawHtml, extractByRaw.notNull(), extractByRaw.multi());
Method setterMethod = getSetterMethod(clazz, field); Method setterMethod = getSetterMethod(clazz, field);
@ -228,7 +216,7 @@ class PageModelExtractor {
targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); targetUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
} }
if (!targetUrl.sourceRegion().equals("")) { if (!targetUrl.sourceRegion().equals("")) {
targetUrlRegionSelector = new Xpath2Selector(targetUrl.sourceRegion()); targetUrlRegionSelector = new XpathSelector(targetUrl.sourceRegion());
} }
} }
annotation = clazz.getAnnotation(HelpUrl.class); annotation = clazz.getAnnotation(HelpUrl.class);
@ -239,13 +227,13 @@ class PageModelExtractor {
helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")")); helpUrlPatterns.add(Pattern.compile("(" + s.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"));
} }
if (!helpUrl.sourceRegion().equals("")) { if (!helpUrl.sourceRegion().equals("")) {
helpUrlRegionSelector = new Xpath2Selector(helpUrl.sourceRegion()); helpUrlRegionSelector = new XpathSelector(helpUrl.sourceRegion());
} }
} }
annotation = clazz.getAnnotation(ExtractBy.class); annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) { if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation; ExtractBy extractBy = (ExtractBy) annotation;
extractor = new Extractor(new Xpath2Selector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi()); extractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
} }
} }

@ -16,9 +16,9 @@ public @interface ExtractBy {
String value(); String value();
public enum Type {XPath2, XPath, Regex, Css} public enum Type {XPath, Regex, Css}
Type type() default Type.XPath2; Type type() default Type.XPath;
boolean notNull() default true; boolean notNull() default true;

@ -6,6 +6,7 @@ import java.lang.annotation.Target;
/** /**
* <br> * <br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @date: 13-8-1 <br> * @date: 13-8-1 <br>
* Time: 8:40 <br> * Time: 8:40 <br>
@ -16,8 +17,8 @@ public @interface ExtractBy2 {
String value(); String value();
public enum Type {XPath2, XPath, Regex, Css} public enum Type {XPath, Regex, Css}
Type type() default Type.XPath2; Type type() default Type.XPath;
} }

@ -16,8 +16,8 @@ public @interface ExtractBy3 {
String value(); String value();
public enum Type {XPath2, XPath, Regex, Css} public enum Type { XPath, Regex, Css}
Type type() default Type.XPath2; Type type() default Type.XPath;
} }

@ -16,9 +16,9 @@ public @interface ExtractByRaw {
String value(); String value();
public enum Type {XPath2, XPath, Regex, Css} public enum Type {XPath, Regex, Css}
Type type() default Type.XPath2; Type type() default Type.XPath;
boolean notNull() default true; boolean notNull() default true;

@ -0,0 +1,3 @@
webmagic-saxon
-------
webmagic的扩展模块依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大不作为默认模块引入。

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the webmagic-saxon module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Inherits from the us.codecraft:webmagic parent POM, pinned to version 0.2.0. -->
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId>
<version>0.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<!-- No groupId or version is declared for this artifact; Maven inherits both from the parent. -->
<artifactId>webmagic-saxon</artifactId>
<dependencies>
<!-- Core webmagic module, resolved at the same version as this project. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Saxon-HE: no version is declared here; presumably it is supplied by the
     parent's dependencyManagement section. TODO confirm against the parent POM. -->
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<!-- JUnit: no version or scope is declared here; presumably both are managed
     by the parent POM. TODO confirm it resolves to test scope. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
</project>

@ -1,25 +1,8 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.om.NamespaceResolver;
import net.sf.saxon.pull.NamespaceContextImpl;
import net.sf.saxon.xpath.JAXPXPathStaticContext;
import net.sf.saxon.xpath.XPathEvaluator;
import net.sf.saxon.xpath.XPathFactoryImpl;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Ignore; import org.junit.Ignore;
import org.junit.Test; import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import javax.xml.xpath.*;
import java.util.Collections;
import java.util.Iterator;
/** /**
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 10:06 * @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 10:06
Loading…
Cancel
Save