Test Saxon and introduce XPath 2.0 support

pull/17/head
yihua.huang 12 years ago
parent 3fe3d8f044
commit d7899e94ae

@ -27,6 +27,11 @@
<artifactId>httpclient</artifactId>
<version>4.2.4</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
@ -45,7 +50,7 @@
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.4</version>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

@ -27,6 +27,11 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>

@ -0,0 +1,45 @@
package us.codecraft.webmagic.selector;
import net.sf.saxon.xpath.XPathFactoryImpl;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactoryConfigurationException;
/**
 * Smoke test that evaluates an XPath 2.0 expression with Saxon against a W3C DOM
 * built by HtmlCleaner.
 *
 * @author yihua.huang@dianping.com <br>
 * @date: 13-8-2 <br>
 * Time: 5:48 <br>
 */
public class SaxonTest {

    /**
     * Cleans a small sample document, converts it to a DOM and selects the {@code a}
     * elements whose text matches {@code #...#} via the XPath 2.0 {@code matches()} function.
     * The selected node list is printed to standard output.
     *
     * @throws XPathFactoryConfigurationException declared for signature compatibility;
     *         the current body does not perform a URI-based factory lookup
     */
    @Test
    public void test() throws XPathFactoryConfigurationException {
        String xml = "<root><a>#BBB#</a><a>#CCC#</a><b><a>#DDD#</a></b></root>";
        try {
            HtmlCleaner htmlCleaner = new HtmlCleaner();
            // Feed the sample markup to the cleaner so the expression has data to match.
            TagNode tagNode = htmlCleaner.clean(xml);
            Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
            // Instantiate the Saxon factory directly: XPathFactoryImpl.newInstance() is the
            // inherited static XPathFactory.newInstance(), which goes through the JAXP lookup
            // and is not guaranteed to return Saxon (required for XPath 2.0 functions).
            javax.xml.xpath.XPathFactory factory = new XPathFactoryImpl();
            XPath xpath = factory.newXPath();
            XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]");
            Object result = expr.evaluate(document, XPathConstants.NODESET);
            NodeList nodes = (NodeList) result;
            System.out.println(nodes);
        } catch (Exception e) {
            // Propagate so JUnit reports the failure instead of silently passing.
            throw new RuntimeException("Saxon XPath 2.0 evaluation failed", e);
        }
    }
}

@ -1,7 +1,24 @@
package us.codecraft.webmagic.selector;
import net.sf.saxon.Configuration;
import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.om.NamespaceResolver;
import net.sf.saxon.pull.NamespaceContextImpl;
import net.sf.saxon.xpath.JAXPXPathStaticContext;
import net.sf.saxon.xpath.XPathEvaluator;
import net.sf.saxon.xpath.XPathFactoryImpl;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Assert;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import javax.xml.xpath.*;
import java.util.Collections;
import java.util.Iterator;
/**
* @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 10:06
@ -1354,4 +1371,50 @@ public class XpathSelectorTest {
Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
}
/**
 * Runs the {@code //h1/text()} selector over a sample headline fragment and prints
 * whatever the selector returns.
 */
@Test
public void testXPath2() {
    final String headlineHtml = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
            "<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
    // Build the selector inline; only its selection result is of interest here.
    System.out.println(new XpathSelector("//h1/text()").select(headlineHtml));
}
//http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
/**
 * Evaluates {@code fn:substring-before(//h1, '\n')} with a Saxon {@link XPathEvaluator}
 * against a DOM built by HtmlCleaner and prints the resulting string.
 * <p>
 * The evaluator is constructed directly from a Saxon {@link Configuration}, so the test
 * neither performs a JAXP factory lookup nor modifies JVM-wide system properties.
 *
 * @throws XPathFactoryConfigurationException declared for signature compatibility;
 *         the current body does not perform a URI-based factory lookup
 */
@Test
public void testSaxon() throws XPathFactoryConfigurationException {
    String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
            "<span>2013-07-31 23:29:45&nbsp;&nbsp;&nbsp;来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;责任编辑:张斯炜</span></h1>";
    try {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode tagNode = htmlCleaner.clean(text);
        Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
        Configuration config = Configuration.newConfiguration();
        XPathEvaluator xPathEvaluator = new XPathEvaluator(config);
        JAXPXPathStaticContext context = new JAXPXPathStaticContext(config);
        // Supply a namespace context so the "fn" prefix used in the expression resolves
        // to the XPath functions namespace.
        context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() {
            @Override
            public String getURIForPrefix(String s, boolean b) {
                // Every prefix is mapped to the functions namespace.
                return NamespaceConstant.FN;
            }

            @Override
            public Iterator<String> iteratePrefixes() {
                return Collections.singletonList("fn").iterator();
            }
        }));
        xPathEvaluator.setStaticContext(context);
        // '\n' is a Java escape: the XPath string literal contains a real line break.
        XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
        Object result = expr.evaluate(document, XPathConstants.STRING);
        System.out.println(result);
    } catch (Exception e) {
        // Propagate so JUnit reports the failure instead of silently passing.
        throw new RuntimeException("Saxon XPath evaluation failed", e);
    }
}
}

Loading…
Cancel
Save