diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index 0206b35d..5031077c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -43,6 +43,8 @@ public class CssSelector implements Selector { private String getValue(Element element) { if (attrName == null) { return element.outerHtml(); + } else if ("innerHtml".equalsIgnoreCase(attrName)) { + return element.html(); } else { return element.attr(attrName); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java index fd16dcb0..eee86b26 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java @@ -26,9 +26,9 @@ public class OrSelector implements Selector { @Override public String select(String text) { for (Selector selector : selectors) { - text = selector.select(text); - if (text != null) { - return text; + String result = selector.select(text); + if (result != null) { + return result; } } return null; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 03967f31..6b1db967 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -20,7 +20,9 @@ public class RegexSelector implements Selector { private Pattern regex; - public RegexSelector(String regexStr) { + private int group = 1; + + public RegexSelector(String regexStr, int group) { if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } @@ -36,11 +38,16 @@ public class RegexSelector implements Selector { } catch (PatternSyntaxException e) { throw new IllegalArgumentException("invalid regex", e); } + this.group = group; + } + + public RegexSelector(String regexStr) { + this(regexStr, 1); } @Override public String select(String text) { - return selectGroup(text).get(1); + return selectGroup(text).get(group); } @Override @@ -48,7 +55,7 @@ public class RegexSelector implements Selector { List strings = new ArrayList(); List results = selectGroupList(text); for (RegexResult result : results) { - strings.add(result.get(1)); + strings.add(result.get(group)); } return strings; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index cbd3c225..8a0c76c9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -27,7 +27,11 @@ public class SelectorFactory { } public RegexSelector newRegexSelector(String regex, int group) { - return newSelector(RegexSelector.class, regex, String.valueOf(group)); + String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group)); + if (innerCache.get(cacheKey) != null) { + return (RegexSelector) innerCache.get(cacheKey); + } + return new RegexSelector(regex, group); } public ReplaceSelector newReplaceSelector(String regex, String replacement) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java new file mode 100644 index 00000000..b52d1287 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -0,0 +1,44 @@ +package us.codecraft.webmagic.selector; + +/** + * Convenient methods for selectors.
+ * + * @author code4crafter@gmail.com
+ * @since 0.2.1 + */ +public abstract class Selectors { + + public static RegexSelector regex(String expr) { + return SelectorFactory.getInstatnce().newRegexSelector(expr); + } + + public static RegexSelector regex(String expr, int group) { + return SelectorFactory.getInstatnce().newRegexSelector(expr, group); + } + + public static CssSelector $(String expr) { + return new CssSelector(expr); + } + + public static CssSelector $(String expr, String attrName) { + return new CssSelector(expr, attrName); + } + + public static XpathSelector xpath(String expr) { + return SelectorFactory.getInstatnce().newXpathSelector(expr); + } + + public static AndSelector and(Selector... selectors) { + return new AndSelector(selectors); + } + + public static OrSelector or(Selector... selectors) { + return new OrSelector(selectors); + } + + public static void main(String[] args) { + String s = "a"; + or(regex("(.*)"), xpath("//title"), $("title")).select(s); + } + +} \ No newline at end of file diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java new file mode 100644 index 00000000..b3980072 --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java @@ -0,0 +1,34 @@ +package us.codecraft.webmagic.selector; + +import junit.framework.Assert; +import org.junit.Test; + +import static us.codecraft.webmagic.selector.Selectors.*; + +/** + * @author code4crafter@gmail.com
+ */ +public class ExtractorsTest { + + String html = "

testaabbcc

"; + + String html2 = "aabbcc"; + + @Test + public void testEach() { + Assert.assertEquals("aabbcc", $("div h1 a").select(html)); + Assert.assertEquals("xxx", $("div h1 a", "href").select(html)); + Assert.assertEquals("aabbcc", $("div h1 a", "innerHtml").select(html)); + Assert.assertEquals("xxx", xpath("//a/@href").select(html)); + Assert.assertEquals("xxx", regex("a href=\"(.*)\"").select(html)); + Assert.assertEquals("xxx", regex("(a href)=\"(.*)\"", 2).select(html)); + } + + @Test + public void testCombo() { + Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2)); + OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title")); + Assert.assertEquals("aabbcc", or.select(html)); + Assert.assertEquals("aabbcc", or.select(html2)); + } +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 849a4d6e..21660a19 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -5,8 +5,6 @@ import org.junit.Test; /** * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:13 */ public class RegexSelectorTest {