From 268bd8d0c4b1e2385e2d5f97749869c06792122c Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 7 Aug 2013 23:04:10 +0800 Subject: [PATCH] remove saxon to extension --- webmagic-core/pom.xml | 5 --- .../us/codecraft/webmagic/selector/Html.java | 6 --- .../webmagic/selector/PlainText.java | 5 --- .../webmagic/selector/Selectable.java | 8 ---- .../webmagic/selector/SelectorFactory.java | 4 -- .../webmagic/selector/SaxonTest.java | 45 ------------------- webmagic-extension/pom.xml | 4 ++ .../webmagic/selector/Xpath2Selector.java | 0 .../webmagic/selector/XpathSelectorTest.java | 36 --------------- 9 files changed, 4 insertions(+), 109 deletions(-) delete mode 100644 webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java rename {webmagic-core => webmagic-extension}/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java (100%) rename {webmagic-core => webmagic-extension}/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java (98%) diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index d2c48b24..cf42d2a9 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -27,11 +27,6 @@ commons-lang3 - - net.sf.saxon - Saxon-HE - - log4j log4j diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 79d62a01..114eef99 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -63,12 +63,6 @@ public class Html extends PlainText { return selectList(xpathSelector, strings); } - @Override - public Selectable xpath2(String xpath) { - Xpath2Selector xpathSelector = SelectorFactory.getInstatnce().newXpath2Selector(xpath); - return selectList(xpathSelector, strings); - } - @Override public Selectable $(String selector) { CssSelector cssSelector = new CssSelector(selector); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 4fff6da8..d06a5310 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -34,11 +34,6 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } - @Override - public Selectable xpath2(String xpath) { - throw new UnsupportedOperationException(); - } - @Override public Selectable $(String selector) { throw new UnsupportedOperationException(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index cea501dd..42f3d108 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -18,14 +18,6 @@ public interface Selectable { */ public Selectable xpath(String xpath); - /** - * select list with xpath 2.0 syntax - * - * @param xpath - * @return new Selectable after extract - */ - public Selectable xpath2(String xpath); - /** * select list with css selector * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 9abb1ce3..1dd56e01 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -34,10 +34,6 @@ public class SelectorFactory { return newSelector(XpathSelector.class, xpath); } - public Xpath2Selector newXpath2Selector(String xpath) { - return newSelector(Xpath2Selector.class, xpath); - } - public SmartContentSelector newSmartContentSelector(){ return newSelector(SmartContentSelector.class); } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java deleted file mode 100644 index 05a89063..00000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/SaxonTest.java +++ /dev/null @@ -1,45 +0,0 @@ -package us.codecraft.webmagic.selector; - -import net.sf.saxon.xpath.XPathFactoryImpl; -import org.htmlcleaner.CleanerProperties; -import org.htmlcleaner.DomSerializer; -import org.htmlcleaner.HtmlCleaner; -import org.htmlcleaner.TagNode; -import org.junit.Test; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; - -import javax.xml.xpath.XPath; -import javax.xml.xpath.XPathConstants; -import javax.xml.xpath.XPathExpression; -import javax.xml.xpath.XPathFactoryConfigurationException; - -/** - * @author code4crafter@gmail.com
- * @date: 13-8-2
- * Time: 下午5:48
- */ -public class SaxonTest { - - @Test - public void test() throws XPathFactoryConfigurationException { -// System.setProperty("javax.xml.xpath.XPathFactory:" + NamespaceConstant.OBJECT_MODEL_SAXON, "net.sf.saxon.xpath.XPathFactoryImpl"); -// XPathFactory xpf = XPathFactory.newInstance(NamespaceConstant.OBJECT_MODEL_SAXON); - String xml = "#BBB##CCC##DDD#"; - try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(""); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - - javax.xml.xpath.XPathFactory factory = XPathFactoryImpl.newInstance(); - XPath xpath = factory.newXPath(); - XPathExpression expr = xpath.compile("//a[matches(.,'#...#')]"); - - Object result = expr.evaluate(document, XPathConstants.NODESET); - NodeList nodes = (NodeList) result; - System.out.println(nodes); - } catch (Exception e) { - e.printStackTrace(); - } - } -} diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 63034f23..843c2c3c 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -27,6 +27,10 @@ webmagic-core ${project.version}
+ + net.sf.saxon + Saxon-HE + junit junit diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java similarity index 100% rename from webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java similarity index 98% rename from webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java rename to webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 2f663c99..9f32a8f1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1380,42 +1380,6 @@ public class XpathSelectorTest { System.out.println(xpathSelector.select(text)); } - //http://sourceforge.net/mailarchive/forum.php?thread_name=4EA92A8A.6080202%40saxonica.com&forum_name=saxon-help - @Test - public void testSaxon() { - String text = "

眉山:扎实推进农业农村工作 促农持续增收
\n" + - "2013-07-31 23:29:45   来源:眉山网      责任编辑:张斯炜

"; - try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - XPathEvaluator xPathEvaluator = new XPathEvaluator(); - xPathEvaluator.setNamespaceContext(new NamespaceContextImpl(new NamespaceResolver() { - - - @Override - public String getURIForPrefix(String s, boolean b) { - return NamespaceConstant.FN; - } - - @Override - public Iterator iteratePrefixes() { - return Collections.singletonList("fn").iterator(); - } - })); - XPathExpression expr = xPathEvaluator.compile("fn:substring-before(//h1,'\n')"); - Object result = expr.evaluate(document, XPathConstants.STRING); - Assert.assertNotNull(result); - } catch (Exception e) { - e.printStackTrace(); - } - Xpath2Selector xpath2Selector = new Xpath2Selector("fn:substring-before(//h1,'\n')"); - String select = xpath2Selector.select(text); - Assert.assertNotNull(select); - Assert.assertNotNull(xpath2Selector.selectList(text)); - - } - @Test public void testXpath2Selector() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href");