newselectors

pull/23/merge
yihua.huang 12 years ago
parent b1cba78bd6
commit 55d4a76ab7

@ -25,6 +25,12 @@
<artifactId>commons-lang3</artifactId> <artifactId>commons-lang3</artifactId>
</dependency> </dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>xsoup</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency> <dependency>
<groupId>log4j</groupId> <groupId>log4j</groupId>
<artifactId>log4j</artifactId> <artifactId>log4j</artifactId>

@ -0,0 +1,23 @@
package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import java.util.List;
/**
 * Base class for selectors that work on parsed html elements.<br>
 * Implements the text-based {@link Selector} methods by parsing the text with
 * Jsoup and delegating to the element-based {@link ElementSelector} methods,
 * which subclasses have to provide.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.2
 */
public abstract class BaseElementSelector implements Selector, ElementSelector {

    /**
     * Parse the text as html and extract a single result from the document.
     *
     * @param text html text, may be null
     * @return the result of {@link #select(org.jsoup.nodes.Element)} on the
     *         parsed document, or null when text is null
     */
    @Override
    public String select(String text) {
        if (text == null) {
            // Nothing to parse, so there is nothing to select.
            return null;
        }
        return select(Jsoup.parse(text));
    }

    /**
     * Parse the text as html and extract all results from the document.
     *
     * @param text html text, may be null
     * @return the result of {@link #selectList(org.jsoup.nodes.Element)} on the
     *         parsed document, or an empty list when text is null
     */
    @Override
    public List<String> selectList(String text) {
        if (text == null) {
            // Fresh mutable list so callers can treat it like any other result.
            return new java.util.ArrayList<String>();
        }
        return selectList(Jsoup.parse(text));
    }
}

@ -1,8 +1,6 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
@ -15,7 +13,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.1.0 * @since 0.1.0
*/ */
public class CssSelector implements Selector { public class CssSelector extends BaseElementSelector {
private String selectorText; private String selectorText;
@ -30,16 +28,6 @@ public class CssSelector implements Selector {
this.attrName = attrName; this.attrName = attrName;
} }
@Override
public String select(String text) {
Document doc = Jsoup.parse(text);
Elements elements = doc.select(selectorText);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
return getValue(elements.get(0));
}
private String getValue(Element element) { private String getValue(Element element) {
if (attrName == null) { if (attrName == null) {
return element.outerHtml(); return element.outerHtml();
@ -51,9 +39,17 @@ public class CssSelector implements Selector {
} }
@Override @Override
public List<String> selectList(String text) { public String select(Element element) {
Elements elements = element.select(selectorText);
if (CollectionUtils.isEmpty(elements)) {
return null;
}
return getValue(elements.get(0));
}
@Override
public List<String> selectList(Element doc) {
List<String> strings = new ArrayList<String>(); List<String> strings = new ArrayList<String>();
Document doc = Jsoup.parse(text);
Elements elements = doc.select(selectorText); Elements elements = doc.select(selectorText);
if (CollectionUtils.isNotEmpty(elements)) { if (CollectionUtils.isNotEmpty(elements)) {
for (Element element : elements) { for (Element element : elements) {

@ -0,0 +1,32 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import java.util.List;
/**
 * Selector(extractor) that works on an already parsed html element
 * instead of raw text.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public interface ElementSelector {

    /**
     * Extract a single result from the element.<br>
     * If there are more than one result, only the first will be chosen.
     *
     * @param element the html element to extract from
     * @return result
     */
    String select(Element element);

    /**
     * Extract all results from the element.<br>
     *
     * @param element the html element to extract from
     * @return results
     */
    List<String> selectList(Element element);
}

@ -0,0 +1,32 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
import java.util.List;
/**
 * XPath selector based on Xsoup.<br>
 * The xpath expression is compiled once in the constructor and the compiled
 * evaluator is reused for every selection.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.2
 */
public class XsoupSelector extends BaseElementSelector {

    /** Compiled xpath expression; assigned once in the constructor. */
    private final XPathEvaluator xPathEvaluator;

    /**
     * Create a selector for the given xpath expression.
     *
     * @param xpathStr xpath expression, compiled with {@code Xsoup.compile}
     */
    public XsoupSelector(String xpathStr) {
        this.xPathEvaluator = Xsoup.compile(xpathStr);
    }

    /**
     * Evaluate the xpath against the element and return a single result.
     *
     * @param element the html element to evaluate against
     * @return whatever {@code get()} of the evaluation result returns
     */
    @Override
    public String select(Element element) {
        return xPathEvaluator.evaluate(element).get();
    }

    /**
     * Evaluate the xpath against the element and return all results.
     *
     * @param element the html element to evaluate against
     * @return whatever {@code list()} of the evaluation result returns
     */
    @Override
    public List<String> selectList(Element element) {
        return xPathEvaluator.evaluate(element).list();
    }
}
Loading…
Cancel
Save