From d7899e94aeab00d0be2aaed2989c814f08e2cf2b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Fri, 2 Aug 2013 23:39:34 +0800 Subject: [PATCH] test saxon and invite XPath2.0 support --- pom.xml | 7 ++- webmagic-core/pom.xml | 5 ++ .../webmagic/selector/SaxonTest.java | 45 +++++++++++++ .../webmagic/selector/XpathSelectorTest.java | 63 +++++++++++++++++++ 4 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java diff --git a/pom.xml b/pom.xml index 5974eae8..fa369f4a 100644 --- a/pom.xml +++ b/pom.xml @@ -27,6 +27,11 @@ httpclient 4.2.4 + + net.sf.saxon + Saxon-HE + 9.5.1-1 + log4j log4j @@ -45,7 +50,7 @@ net.sourceforge.htmlcleaner htmlcleaner - 2.4 + 2.5 org.apache.commons diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 60c37c02..a5fbd755 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -27,6 +27,11 @@ commons-lang3 + + net.sf.saxon + Saxon-HE + + log4j log4j diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java new file mode 100644 index 00000000..509be440 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java @@ -0,0 +1,45 @@ +package us.codecraft.webmagic.selector; + +import net.sf.saxon.xpath.XPathFactoryImpl; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; +import org.junit.Test; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import javax.xml.xpath.XPathFactoryConfigurationException; + +/** + * @author yihua.huang@dianping.com
+ * @date: 13-8-2
+ * Time: 下午5:48
+ */ +public class SaxonTest { + + @Test + public void test() throws XPathFactoryConfigurationException { +// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); +// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + String xml = "#BBB##CCC##DDD#"; + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(""); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + + javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(); + XPath xpath = factory.newXPath(); + XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]"); + + Object result = expr.evaluate(document, XPathConstants.NODESET); + NodeList nodes = (NodeList) result; + System.out.println(nodes); + } catch (Exception e) { + e.printStackTrace(); + } + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 6f1c21ed..c2cc7eca 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1,7 +1,24 @@ package us.codecraft.webmagic.selector; +import net.sf.saxon.Configuration; +import net.sf.saxon.lib.NamespaceConstant; +import net.sf.saxon.om.NamespaceResolver; +import net.sf.saxon.pull.NamespaceContextImpl; +import net.sf.saxon.xpath.JAXPXPathStaticContext; +import net.sf.saxon.xpath.XPathEvaluator; +import net.sf.saxon.xpath.XPathFactoryImpl; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.DomSerializer; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; import org.junit.Assert; import org.junit.Test; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; + +import javax.xml.xpath.*; +import java.util.Collections; +import java.util.Iterator; /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 @@ -1354,4 +1371,50 @@ public class XpathSelectorTest { Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all()); } + @Test + public void testXPath2() { + String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; + XpathSelector xpathSelector = new XpathSelector("//h1/text()"); + System.out.println(xpathSelector.select(text)); + } + + //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help + @Test + public void testSaxon() throws XPathFactoryConfigurationException { + System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); + System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.FN, "net.sf.saxon.xpath.XPathFactoryImpl"); + XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + + "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; + try { + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); + javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); + Configuration config = Configuration.newConfiguration(); + XPathEvaluator xPathEvaluator = new XPathEvaluator(config); + JAXPXPathStaticContext context = new JAXPXPathStaticContext(config); + context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { + + + @Override + public String getURIForPrefix(String s, boolean b) { + return NamespaceConstant.FN; + } + + @Override + public Iterator iteratePrefixes() { + return Collections.singletonList("fn").iterator(); + } + })); + xPathEvaluator.setStaticContext(context); + XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); + Object result = expr.evaluate(document, XPathConstants.STRING); + System.out.println(result); + } catch (Exception e) { + e.printStackTrace(); + } + } + }