extractors

pull/17/head
yihua.huang 12 years ago
parent 20705b34ac
commit c1471718df

@ -43,6 +43,8 @@ public class CssSelector implements Selector {
private String getValue(Element element) {
if (attrName == null) {
return element.outerHtml();
} else if ("innerHtml".equalsIgnoreCase(attrName)) {
return element.html();
} else {
return element.attr(attrName);
}

@ -26,9 +26,9 @@ public class OrSelector implements Selector {
@Override
public String select(String text) {
for (Selector selector : selectors) {
text = selector.select(text);
if (text != null) {
return text;
String result = selector.select(text);
if (result != null) {
return result;
}
}
return null;

@ -20,7 +20,9 @@ public class RegexSelector implements Selector {
private Pattern regex;
public RegexSelector(String regexStr) {
private int group = 1;
public RegexSelector(String regexStr, int group) {
if (StringUtils.isBlank(regexStr)) {
throw new IllegalArgumentException("regex must not be empty");
}
@ -36,11 +38,16 @@ public class RegexSelector implements Selector {
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("invalid regex", e);
}
this.group = group;
}
/**
 * Creates a selector for {@code regexStr} that extracts capture group 1.
 * <p>
 * Delegates to the two-argument constructor, which rejects a blank or
 * syntactically invalid expression with an {@link IllegalArgumentException}.
 *
 * @param regexStr the regular expression to compile
 */
public RegexSelector(String regexStr) {
this(regexStr, 1);
}
@Override
public String select(String text) {
return selectGroup(text).get(1);
return selectGroup(text).get(group);
}
@Override
@ -48,7 +55,7 @@ public class RegexSelector implements Selector {
List<String> strings = new ArrayList<String>();
List<RegexResult> results = selectGroupList(text);
for (RegexResult result : results) {
strings.add(result.get(1));
strings.add(result.get(group));
}
return strings;
}

@ -27,7 +27,11 @@ public class SelectorFactory {
}
public RegexSelector newRegexSelector(String regex, int group) {
return newSelector(RegexSelector.class, regex, String.valueOf(group));
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
if (innerCache.get(cacheKey) != null) {
return (RegexSelector) innerCache.get(cacheKey);
}
return new RegexSelector(regex, group);
}
public ReplaceSelector newReplaceSelector(String regex, String replacement) {

@ -0,0 +1,44 @@
package us.codecraft.webmagic.selector;
/**
 * Convenient static factory methods for building selectors.<br>
 * Intended to be used through a static import, for example
 * {@code or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(html)}.
 * <p>
 * All members are static; the class is declared abstract, so it cannot be
 * instantiated directly.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.1
 */
public abstract class Selectors {

    /**
     * Obtains a regex selector for {@code expr} from the {@link SelectorFactory} instance.
     * <p>
     * NOTE(review): presumably extracts capture group 1, the default of
     * {@link RegexSelector} - confirm against {@code SelectorFactory.newRegexSelector(String)}.
     *
     * @param expr the regular expression
     * @return the selector returned by {@code SelectorFactory.newRegexSelector(expr)}
     */
    public static RegexSelector regex(String expr) {
        return SelectorFactory.getInstatnce().newRegexSelector(expr);
    }

    /**
     * Obtains a regex selector for {@code expr} that extracts the given capture group.
     *
     * @param expr  the regular expression
     * @param group index of the capture group to extract
     * @return the selector returned by {@code SelectorFactory.newRegexSelector(expr, group)}
     */
    public static RegexSelector regex(String expr, int group) {
        return SelectorFactory.getInstatnce().newRegexSelector(expr, group);
    }

    /**
     * Creates a new CSS selector for {@code expr}.
     * <p>
     * NOTE(review): with no attribute name the selector presumably yields the
     * matched element's outer HTML - confirm against {@link CssSelector}.
     *
     * @param expr the CSS query
     * @return a new {@link CssSelector}
     */
    public static CssSelector $(String expr) {
        return new CssSelector(expr);
    }

    /**
     * Creates a new CSS selector for {@code expr} that extracts {@code attrName}
     * from the matched element.
     *
     * @param expr     the CSS query
     * @param attrName name of the attribute to extract; {@link CssSelector} treats
     *                 {@code "innerHtml"} (case-insensitive) as a request for the
     *                 element's inner HTML rather than an attribute
     * @return a new {@link CssSelector}
     */
    public static CssSelector $(String expr, String attrName) {
        return new CssSelector(expr, attrName);
    }

    /**
     * Obtains an XPath selector for {@code expr} from the {@link SelectorFactory} instance.
     *
     * @param expr the XPath expression
     * @return the selector returned by {@code SelectorFactory.newXpathSelector(expr)}
     */
    public static XpathSelector xpath(String expr) {
        return SelectorFactory.getInstatnce().newXpathSelector(expr);
    }

    /**
     * Combines the given selectors into an {@link AndSelector}.
     * <p>
     * NOTE(review): presumably each selector is applied to the output of the
     * previous one - confirm against {@link AndSelector}.
     *
     * @param selectors the selectors to combine, in order
     * @return a new {@link AndSelector} wrapping {@code selectors}
     */
    public static AndSelector and(Selector... selectors) {
        return new AndSelector(selectors);
    }

    /**
     * Combines the given selectors into an {@link OrSelector}, which tries them
     * in order and returns the first non-null result.
     *
     * @param selectors the alternative selectors, in priority order
     * @return a new {@link OrSelector} wrapping {@code selectors}
     */
    public static OrSelector or(Selector... selectors) {
        return new OrSelector(selectors);
    }
}

@ -0,0 +1,34 @@
package us.codecraft.webmagic.selector;
import junit.framework.Assert;
import org.junit.Test;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Checks the static factory helpers of {@link Selectors}, first one by one and
 * then combined through {@code and(...)} / {@code or(...)}.
 *
 * @author code4crafter@gmail.com <br>
 */
public class ExtractorsTest {

    String html = "<div><h1>test<a href=\"xxx\">aabbcc</a></h1></div>";

    String html2 = "<title>aabbcc</title>";

    /**
     * Each kind of selector, used on its own against the anchor fixture.
     */
    @Test
    public void testEach() {
        final String anchorQuery = "div h1 a";
        final String expectedHref = "xxx";

        // CSS: the whole element, one attribute, then the inner HTML.
        Assert.assertEquals("<a href=\"xxx\">aabbcc</a>", $(anchorQuery).select(html));
        Assert.assertEquals(expectedHref, $(anchorQuery, "href").select(html));
        Assert.assertEquals("aabbcc", $(anchorQuery, "innerHtml").select(html));

        // XPath attribute lookup.
        Assert.assertEquals(expectedHref, xpath("//a/@href").select(html));

        // Regex: default capture group, then an explicitly chosen one.
        Assert.assertEquals(expectedHref, regex("a href=\"(.*)\"").select(html));
        Assert.assertEquals(expectedHref, regex("(a href)=\"(.*)\"", 2).select(html));
    }

    /**
     * Selectors composed with {@code and} and {@code or}.
     */
    @Test
    public void testCombo() {
        final String expectedText = "aabbcc";

        // CSS selection of the title followed by a regex on it.
        Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));

        // The same OrSelector must succeed on both fixtures, each through a different branch.
        OrSelector anchorOrTitle = or($("div h1 a", "innerHtml"), xpath("//title"));
        Assert.assertEquals(expectedText, anchorOrTitle.select(html));
        Assert.assertEquals(expectedText, anchorOrTitle.select(html2));
    }
}

@ -5,8 +5,6 @@ import org.junit.Test;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 7:13
*/
public class RegexSelectorTest {

Loading…
Cancel
Save