# Review notes for this patch (webmagic-core selector package):
#
# * CssSelector: adds a (selectorText, attrName) constructor. The new private
#   getValue(Element) returns the element's outer HTML when attrName is null,
#   otherwise the value of that attribute. select() applies it to the first
#   match; selectList() applies it to every match and skips null values.
# * Selectable gains $(selector, attrName) and regex(regex, group). Html
#   implements the former with the two-argument CssSelector; PlainText throws
#   UnsupportedOperationException for it and implements regex(regex, group)
#   through SelectorFactory.newRegexSelector(regex, group), which forwards the
#   group to newSelector as a String.
#
# Fix applied in review: getValue() now returns null when the element does not
# carry the requested attribute (Element.hasAttr check). Jsoup's attr() returns
# an empty string, never null, for a missing key, so the "value != null" filter
# in selectList() could never skip anything and "" entries ended up in the
# result list. Hunk line counts are unchanged; the blob ids on the "index" line
# were not regenerated.
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java index ab391863..0206b35d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/CssSelector.java @@ -19,10 +19,17 @@ public class CssSelector implements Selector { private String selectorText; + private String attrName; + public CssSelector(String selectorText) { this.selectorText = selectorText; } + public CssSelector(String selectorText, String attrName) { + this.selectorText = selectorText; + this.attrName = attrName; + } + @Override public String select(String text) { Document doc = Jsoup.parse(text); @@ -30,7 +37,15 @@ public class CssSelector implements Selector { if (CollectionUtils.isEmpty(elements)) { return null; } - return elements.get(0).outerHtml(); + return getValue(elements.get(0)); + } + + private String getValue(Element element) { + if (attrName == null) { + return element.outerHtml(); + } else { + return element.hasAttr(attrName) ? element.attr(attrName) : null; + } } @Override @@ -40,7 +55,10 @@ public class CssSelector implements Selector { Elements elements = doc.select(selectorText); if (CollectionUtils.isNotEmpty(elements)) { for (Element element : elements) { - strings.add(element.outerHtml()); + String value = getValue(element); + if (value != null) { + strings.add(value); + } } } return strings; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 5a0c6cde..1d5e8c59 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -69,4 +69,10 @@ public class Html extends PlainText { return selectList(cssSelector, strings); } + @Override + public Selectable $(String selector, String attrName) { + CssSelector 
cssSelector = new CssSelector(selector, attrName); + return selectList(cssSelector, strings); + } + } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index 6fabd998..e0501eb9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -40,6 +40,11 @@ public class PlainText implements Selectable { throw new UnsupportedOperationException(); } + @Override + public Selectable $(String selector, String attrName) { + throw new UnsupportedOperationException(); + } + @Override public Selectable smartContent() { throw new UnsupportedOperationException(); @@ -56,6 +61,12 @@ public class PlainText implements Selectable { return selectList(regexSelector, strings); } + @Override + public Selectable regex(String regex, int group) { + RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex, group); + return selectList(regexSelector, strings); + } + protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); for (String string : strings) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 65878ece..21c93817 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -26,6 +26,15 @@ public interface Selectable { */ public Selectable $(String selector); + /** + * select list with css selector + * + * @param selector css selector expression + * @param attrName attribute name of css selector + * @return new Selectable after extract + */ + public Selectable $(String selector, String attrName); + /** * select smart content with ReadAbility algorithm * @@ -41,13 +50,22 @@ 
public interface Selectable { public Selectable links(); /** - * select list with regex + * select list with regex, default group is group 1 * * @param regex * @return new Selectable after extract */ public Selectable regex(String regex); + /** + * select list with regex + * + * @param regex + * @param group + * @return new Selectable after extract + */ + public Selectable regex(String regex, int group); + /** * replace with regex * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index c8a3879c..cbd3c225 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -26,6 +26,10 @@ public class SelectorFactory { return newSelector(RegexSelector.class, regex); } + public RegexSelector newRegexSelector(String regex, int group) { + return newSelector(RegexSelector.class, regex, String.valueOf(group)); + } + public ReplaceSelector newReplaceSelector(String regex, String replacement) { return newSelector(ReplaceSelector.class, regex, replacement); }