向 webmagic-saxon 组件提供若干新 API,更优雅更灵活更强大 (#1108)
* Feature: webmagic-saxon 组件新增若干新 API;
* Update: 更优雅地写代码;
* Update: JaxpSelectorUtils 工具类增加 final 关键字。
pull/1112/head
parent
f47038db63
commit
717931166a
@ -0,0 +1,61 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Static helpers for turning JAXP DOM results ({@link NodeList}, {@link Node})
 * into plain Java lists and strings.
 *
 * @author hooy
 */
public final class JaxpSelectorUtils {

    /** Not instantiable: every member is static. */
    private JaxpSelectorUtils() {
        throw new RuntimeException("The util class cannot be instanced");
    }

    /**
     * Copies the items of a {@link NodeList} into a new mutable list, in order.
     *
     * @param nodes the node list to copy; may be {@code null}
     * @return a new list holding the items of {@code nodes}; empty when
     *         {@code nodes} is {@code null} or has no items
     */
    public static List<Node> NodeListToArrayList(NodeList nodes) {
        if (nodes == null) {
            return new ArrayList<>();
        }
        // The length is read once and used both to presize and to bound the loop.
        int length = nodes.getLength();
        List<Node> list = new ArrayList<>(length);
        for (int i = 0; i < length; i++) {
            list.add(nodes.item(i));
        }
        return list;
    }

    /**
     * Converts a single node to its string form using the same rules as
     * {@link #nodesToStrings(List)}.
     *
     * @param node the node to convert; may be {@code null}
     * @return the string form of {@code node}, or {@code null} when
     *         {@code node} is {@code null}
     * @throws TransformerException if the node cannot be serialized
     */
    public static String nodeToString(Node node) throws TransformerException {
        List<String> converted = nodesToStrings(Collections.singletonList(node));
        // A null node is skipped by nodesToStrings, which leaves the list empty.
        return converted.isEmpty() ? null : converted.get(0);
    }

    /**
     * Converts each node to a string. Attribute and text nodes yield their
     * text content; every other node type is serialized as markup without an
     * XML declaration.
     *
     * @param nodes the nodes to convert; may be {@code null}; {@code null}
     *              elements are skipped
     * @return a new mutable list with one string per non-null node, in order
     * @throws TransformerException if the transformer cannot be created or a
     *                              node cannot be serialized
     */
    public static List<String> nodesToStrings(List<Node> nodes) throws TransformerException {
        if (nodes == null || nodes.isEmpty()) {
            return new ArrayList<>();
        }
        List<String> results = new ArrayList<>(nodes.size());
        // Created on first use: inputs made only of attribute/text nodes never need it.
        Transformer transformer = null;
        for (Node node : nodes) {
            if (node == null) {
                continue;
            }
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                results.add(node.getTextContent());
                continue;
            }
            if (transformer == null) {
                transformer = newMarkupTransformer();
            }
            StringWriter markup = new StringWriter();
            transformer.transform(new DOMSource(node), new StreamResult(markup));
            results.add(markup.toString());
        }
        return results;
    }

    /** Creates an identity transformer configured to omit the XML declaration. */
    private static Transformer newMarkupTransformer() throws TransformerException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        return transformer;
    }

}
|
@ -0,0 +1,32 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.w3c.dom.Node;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Contract for selectors (extractors) that operate on an HTML node.
 *
 * @author hooy
 * @since 0.8.0
 */
public interface NodeSelector {

    /**
     * Extracts a single result as text. When several results are available,
     * only the first one is chosen.
     *
     * @param node the node to extract from
     * @return the extracted result
     */
    String select(Node node);

    /**
     * Extracts every result as text.
     *
     * @param node the node to extract from
     * @return the extracted results
     */
    List<String> selectList(Node node);

}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue