diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 99112cae..98b1efe4 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -8,6 +8,7 @@ import org.htmlcleaner.DomSerializer; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.w3c.dom.Document; +import org.w3c.dom.Node; import org.w3c.dom.NodeList; import javax.xml.namespace.NamespaceContext; @@ -70,7 +71,7 @@ public class Xpath2Selector implements Selector { private XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); - put("xslt",NamespaceConstant.XSLT); + put("xslt", NamespaceConstant.XSLT); } @Override @@ -116,15 +117,20 @@ public class Xpath2Selector implements Selector { result = xPathExpression.evaluate(document, XPathConstants.STRING); } if (result instanceof NodeList) { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); NodeList nodeList = (NodeList) result; if (nodeList.getLength() == 0) { return null; } - transformer.transform(new DOMSource(nodeList.item(0)), xmlOutput); - return xmlOutput.getWriter().toString(); + Node item = nodeList.item(0); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + return item.getTextContent(); + } else { + StreamResult xmlOutput = new StreamResult(new StringWriter()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + transformer.transform(new DOMSource(item), xmlOutput); + return xmlOutput.getWriter().toString(); + } } return result.toString(); } catch (Exception e) { @@ -152,9 +158,14 @@ public class Xpath2Selector implements Selector { StreamResult xmlOutput = new StreamResult(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); for (int i = 0; i < nodeList.getLength(); i++) { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(nodeList.item(i)), xmlOutput); - results.add(xmlOutput.getWriter().toString()); + Node item = nodeList.item(i); + if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { + results.add(item.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(item), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } } } else { results.add(result.toString()); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java index 0f64aef8..98543b09 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/oo/OschinaBlog.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.oo; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; import java.util.List; @@ -9,7 +10,7 @@ import java.util.List; * @date: 13-8-1
* Time: 下午10:18
*/ -@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']") +@TargetUrl(value="http://my.oschina.net/flashsword/blog/*",sourceRegion = "//div[@class='BlogLinks']//a/@href") public class OschinaBlog implements AfterExtractor { @ExtractBy("//title") @@ -23,6 +24,13 @@ public class OschinaBlog implements AfterExtractor { @Override public void afterProcess(Page page) { - content = null; + System.out.println("title:\t"+title); + System.out.println("content:\t"+content); + System.out.println("tags:\t" + tags); } -} + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog/145796"), OschinaBlog.class) + .run(); + } +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 2b8e15de..2f663c99 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1418,7 +1418,7 @@ public class XpathSelectorTest { @Test public void testXpath2Selector() { - Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); + Xpath2Selector xpath2Selector = new Xpath2Selector("//a/@href"); String select = xpath2Selector.select(html); Assert.assertNotNull(select); }