diff --git a/pom.xml b/pom.xml
index 5974eae8..fa369f4a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -27,6 +27,11 @@
                 <artifactId>httpclient</artifactId>
                 <version>4.2.4</version>
             </dependency>
+            <dependency>
+                <groupId>net.sf.saxon</groupId>
+                <artifactId>Saxon-HE</artifactId>
+                <version>9.5.1-1</version>
+            </dependency>
             <dependency>
                 <groupId>log4j</groupId>
                 <artifactId>log4j</artifactId>
@@ -45,7 +50,7 @@
             <dependency>
                 <groupId>net.sourceforge.htmlcleaner</groupId>
                 <artifactId>htmlcleaner</artifactId>
-                <version>2.4</version>
+                <version>2.5</version>
             </dependency>
             <dependency>
                 <groupId>org.apache.commons</groupId>
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index 60c37c02..a5fbd755 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -27,6 +27,11 @@
             <artifactId>commons-lang3</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>net.sf.saxon</groupId>
+            <artifactId>Saxon-HE</artifactId>
+        </dependency>
+
         <dependency>
             <groupId>log4j</groupId>
             <artifactId>log4j</artifactId>
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java
new file mode 100644
index 00000000..509be440
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java
@@ -0,0 +1,44 @@
+package us.codecraft.webmagic.selector;
+
+import net.sf.saxon.xpath.XPathFactoryImpl;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
+import org.junit.Assert;
+import org.junit.Test;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+
+/**
+ * Checks that Saxon can evaluate an XPath 2.0 expression against a DOM built by HtmlCleaner.
+ *
+ * @author yihua.huang@dianping.com <br>
+ * @date: 13-8-2 <br>
+ * Time: 5:48 PM <br>
+ */
+public class SaxonTest {
+
+    /**
+     * Selects the {@code a} elements whose text matches a regular expression, using the
+     * {@code matches()} function, which is not part of XPath 1.0.
+     */
+    @Test
+    public void test() throws Exception {
+        String xml = "<root><a>#BBB#</a><a>#CCC#</a><b><a>#DDD#</a></b></root>";
+        TagNode tagNode = new HtmlCleaner().clean(xml);
+        Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+
+        // Instantiate Saxon directly: XPathFactoryImpl.newInstance() is the inherited static
+        // XPathFactory.newInstance() and returns whatever factory JAXP discovers.
+        XPath xpath = new XPathFactoryImpl().newXPath();
+        XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]");
+
+        NodeList nodes = (NodeList) expr.evaluate(document, XPathConstants.NODESET);
+        Assert.assertEquals(3, nodes.getLength());
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
index 6f1c21ed..c2cc7eca 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java
@@ -1,7 +1,23 @@
 package us.codecraft.webmagic.selector;
 
+import net.sf.saxon.Configuration;
+import net.sf.saxon.lib.NamespaceConstant;
+import net.sf.saxon.om.NamespaceResolver;
+import net.sf.saxon.pull.NamespaceContextImpl;
+import net.sf.saxon.xpath.JAXPXPathStaticContext;
+import net.sf.saxon.xpath.XPathEvaluator;
+import org.htmlcleaner.CleanerProperties;
+import org.htmlcleaner.DomSerializer;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
 import org.junit.Assert;
 import org.junit.Test;
+import org.w3c.dom.Document;
+
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import java.util.Collections;
+import java.util.Iterator;
 
 /**
  * @author code4crafter@gmail.com <br> Date: 13-4-21 Time: 上午10:06
@@ -1354,4 +1370,44 @@ public class XpathSelectorTest {
         Assert.assertNotNull(html1.$("a[href]").xpath("//@href").all());
     }
 
+    @Test
+    public void testXPath2() {
+        String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
+                "<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
+        XpathSelector xpathSelector = new XpathSelector("//h1/text()");
+        System.out.println(xpathSelector.select(text));
+    }
+
+    // Calls an XPath 2.0 function through Saxon's XPathEvaluator, with every prefix resolved to
+    // the XPath function namespace. Any compile or evaluation error propagates and fails the test.
+    // Background: http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help
+    @Test
+    public void testSaxon() throws Exception {
+        String text = "<h1>眉山:扎实推进农业农村工作 促农持续增收<br>\n" +
+                "<span>2013-07-31 23:29:45 来源:<a href=\"http://www.mshw.net\" target=\"_blank\" style=\"color:#AAA\">眉山网</a> 责任编辑:张斯炜</span></h1>";
+        TagNode tagNode = new HtmlCleaner().clean(text);
+        Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
+
+        Configuration config = Configuration.newConfiguration();
+        JAXPXPathStaticContext context = new JAXPXPathStaticContext(config);
+        context.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() {
+
+            @Override
+            public String getURIForPrefix(String s, boolean b) {
+                return NamespaceConstant.FN;
+            }
+
+            @Override
+            public Iterator<String> iteratePrefixes() {
+                return Collections.singletonList("fn").iterator();
+            }
+        }));
+        XPathEvaluator xPathEvaluator = new XPathEvaluator(config);
+        xPathEvaluator.setStaticContext(context);
+
+        XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')");
+        Object result = expr.evaluate(document, XPathConstants.STRING);
+        System.out.println(result);
+    }
+
 }