向 webmagic-saxon 组件提供若干新 API,更优雅更灵活更强大 (#1108)

* Feature:
* webmagic-saxon 组件新增若干新 API;

* Update: 更优雅地写代码。

* Update: JaxpSelectorUtils 工具类增加 final 关键字。
pull/1112/head
hooy 2 years ago committed by GitHub
parent f47038db63
commit 717931166a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,61 @@
package us.codecraft.webmagic.selector;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Static helpers for converting JAXP/DOM results ({@link NodeList}, {@link Node})
 * into plain Java lists and strings.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    /** Not instantiable; guards against reflective construction as well. */
    private JaxpSelectorUtils() {
        throw new RuntimeException("The util class cannot be instanced");
    }

    /**
     * Copies every item of a DOM {@link NodeList} into a new mutable list, preserving order.
     *
     * @param nodes the node list to copy; {@code null} is treated as an empty list
     * @return a new {@link ArrayList} holding the nodes, never {@code null}
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        if (nodes == null) {
            return new ArrayList<>();
        }
        int length = nodes.getLength();
        List<Node> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            list.add(nodes.item(i));
        }
        return list;
    }

    /**
     * Converts a single node to its string form, using the same rules as
     * {@link #nodesToStrings(List)}.
     *
     * @param node the node to convert; may be {@code null}
     * @return the string form of the node, or {@code null} when {@code node} is {@code null}
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        if (node == null) {
            // A "no match" lookup commonly yields null; report it as "no text" instead of an NPE.
            return null;
        }
        return nodesToStrings(Collections.singletonList(node)).get(0);
    }

    /**
     * Converts each node to a string. Attribute and text nodes contribute their text content;
     * every other node is serialized as markup without an XML declaration.
     *
     * @param nodes the nodes to convert; {@code null} is treated as an empty list
     * @return one string per input node, in the same order; a {@code null} element in the
     *         input produces a {@code null} entry so that positions stay aligned
     * @throws TransformerException if a transformer cannot be created or a node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        if (nodes == null) {
            return new ArrayList<>();
        }
        List<String> results = new ArrayList<>(nodes.size());
        // Created on first use only: the factory lookup is unnecessary work when the
        // input is empty or consists solely of text/attribute nodes.
        Transformer transformer = null;
        for (Node node : nodes) {
            if (node == null) {
                results.add(null);
                continue;
            }
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                results.add(node.getTextContent());
                continue;
            }
            if (transformer == null) {
                transformer = newFragmentTransformer();
            }
            StringWriter writer = new StringWriter();
            transformer.transform(new DOMSource(node), new StreamResult(writer));
            results.add(writer.toString());
        }
        return results;
    }

    /** Builds an identity transformer that omits the XML declaration from its output. */
    private static Transformer newFragmentTransformer() throws TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        return transformer;
    }
}

@ -0,0 +1,32 @@
package us.codecraft.webmagic.selector;
import org.w3c.dom.Node;
import java.util.List;
/**
* Selector (extractor) that operates on an already-parsed DOM {@link Node}
* and returns its results as text.<br>
*
* @author hooy <br>
* @since 0.8.0
*/
public interface NodeSelector {
/**
* Extracts a single result, as text, from the given node.<br>
* If more than one result matches, only the first is returned.
*
* @param node the DOM node to select from
* @return the extracted text
*/
String select(Node node);
/**
* Extracts all results, as text, from the given node.<br>
*
* @param node the DOM node to select from
* @return the extracted texts, one entry per result
*/
List<String> selectList(Node node);
}

@ -1,19 +1,10 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import java.io.StringWriter; import java.util.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.NamespaceContext; import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathExpressionException;
@ -32,20 +23,22 @@ import net.sf.saxon.lib.NamespaceConstant;
import net.sf.saxon.xpath.XPathEvaluator; import net.sf.saxon.xpath.XPathEvaluator;
import us.codecraft.webmagic.utils.BaseSelectorUtils; import us.codecraft.webmagic.utils.BaseSelectorUtils;
import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*;
/** /**
* xpath2.0HtmlCleanerSaxon HE<br> * xpath2.0HtmlCleanerSaxon HE<br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com, hooy <br>
* Date: 13-4-21 * Date: 13-4-21
* Time: 9:39 * Time: 9:39
*/ */
public class Xpath2Selector implements Selector { public class Xpath2Selector implements Selector, NodeSelector {
private String xpathStr; private final String xpathStr;
private XPathExpression xPathExpression; private XPathExpression xPathExpression;
private Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
public Xpath2Selector(String xpathStr) { public Xpath2Selector(String xpathStr) {
this.xpathStr = xpathStr; this.xpathStr = xpathStr;
@ -56,25 +49,25 @@ public class Xpath2Selector implements Selector {
} }
} }
public static Xpath2Selector newInstance(String xpathStr) {
return new Xpath2Selector(xpathStr);
}
enum XPath2NamespaceContext implements NamespaceContext { enum XPath2NamespaceContext implements NamespaceContext {
INSTANCE; INSTANCE;
private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<String, String>(); private final Map<String, String> prefix2NamespaceMap = new ConcurrentHashMap<>();
private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<String, List<String>>(); private final Map<String, List<String>> namespace2PrefixMap = new ConcurrentHashMap<>();
private void put(String prefix, String namespaceURI) { private void put(String prefix, String namespaceURI) {
prefix2NamespaceMap.put(prefix, namespaceURI); prefix2NamespaceMap.put(prefix, namespaceURI);
List<String> prefixes = namespace2PrefixMap.get(namespaceURI); List<String> prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>());
if (prefixes == null) {
prefixes = new ArrayList<String>();
namespace2PrefixMap.put(namespaceURI, prefixes);
}
prefixes.add(prefix); prefixes.add(prefix);
} }
private XPath2NamespaceContext() { XPath2NamespaceContext() {
put("fn", NamespaceConstant.FN); put("fn", NamespaceConstant.FN);
put("xslt", NamespaceConstant.XSLT); put("xslt", NamespaceConstant.XSLT);
put("xhtml", NamespaceConstant.XHTML); put("xhtml", NamespaceConstant.XHTML);
@ -113,29 +106,29 @@ public class Xpath2Selector implements Selector {
@Override @Override
public String select(String text) { public String select(String text) {
try { try {
Object result; Document doc = parse(text);
try { return select(doc);
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); } catch (Exception e) {
} catch (XPathExpressionException e) { logger.error("select text error! " + xpathStr, e);
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING);
} }
if (result instanceof NodeList) {
NodeList nodeList = (NodeList) result;
if (nodeList.getLength() == 0) {
return null; return null;
} }
Node item = nodeList.item(0);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { @Override
return item.getTextContent(); public String select(Node node) {
} else { try {
StreamResult xmlOutput = new StreamResult(new StringWriter()); return (String) xPathExpression.evaluate(node, XPathConstants.STRING);
Transformer transformer = TransformerFactory.newInstance().newTransformer(); } catch (Exception e) {
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); logger.error("select text error! " + xpathStr, e);
transformer.transform(new DOMSource(item), xmlOutput);
return xmlOutput.getWriter().toString();
} }
return null;
} }
return result.toString();
@Override
public List<String> selectList(String text) {
try {
Document doc = parse(text);
return selectList(doc);
} catch (Exception e) { } catch (Exception e) {
logger.error("select text error! " + xpathStr, e); logger.error("select text error! " + xpathStr, e);
} }
@ -143,44 +136,62 @@ public class Xpath2Selector implements Selector {
} }
@Override @Override
public List<String> selectList(String text) { public List<String> selectList(Node node) {
List<String> results = new ArrayList<String>();
try { try {
Object result; NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
List<Node> nodes = NodeListToArrayList(result);
return nodesToStrings(nodes);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
}
return null;
}
public Node selectNode(String text) {
try { try {
result = xPathExpression.evaluate(parse(text), XPathConstants.NODESET); Document doc = parse(text);
} catch (XPathExpressionException e) { return selectNode(doc);
result = xPathExpression.evaluate(parse(text), XPathConstants.STRING); } catch (Exception e) {
} logger.error("select text error! " + xpathStr, e);
if (result instanceof NodeList) { }
NodeList nodeList = (NodeList) result; return null;
Transformer transformer = TransformerFactory.newInstance().newTransformer(); }
StreamResult xmlOutput = new StreamResult();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); public Node selectNode(Node node) {
for (int i = 0; i < nodeList.getLength(); i++) { try {
Node item = nodeList.item(i); return (Node) xPathExpression.evaluate(node, XPathConstants.NODE);
if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { } catch (Exception e) {
results.add(item.getTextContent()); logger.error("select text error! " + xpathStr, e);
} else { }
xmlOutput.setWriter(new StringWriter()); return null;
transformer.transform(new DOMSource(item), xmlOutput); }
results.add(xmlOutput.getWriter().toString());
} public List<Node> selectNodes(String text) {
} try {
} else { Document doc = parse(text);
results.add(result.toString()); return selectNodes(doc);
} catch (Exception e) {
logger.error("select text error! " + xpathStr, e);
} }
return null;
}
public List<Node> selectNodes(Node node) {
try {
NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET);
return NodeListToArrayList(result);
} catch (Exception e) { } catch (Exception e) {
logger.error("select text error! " + xpathStr, e); logger.error("select text error! " + xpathStr, e);
} }
return results; return null;
} }
private Document parse(String text) throws ParserConfigurationException { protected static Document parse(String text) throws ParserConfigurationException {
// HtmlCleaner could not parse <tr></tr> or <td></td> tag directly // HtmlCleaner could not parse <tr></tr> or <td></td> tag directly
text = BaseSelectorUtils.preParse(text); text = BaseSelectorUtils.preParse(text);
HtmlCleaner htmlCleaner = new HtmlCleaner(); HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text); TagNode tagNode = htmlCleaner.clean(text);
return new DomSerializer(new CleanerProperties()).createDOM(tagNode); return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
} }
} }

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save