diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index ca40fac0..efa38d8e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -122,6 +122,16 @@ public class PlainText implements Selectable { } } + @Override + public Selectable select(Selector selector) { + return select(selector, strings); + } + + @Override + public Selectable selectList(Selector selector) { + return selectList(selector, strings); + } + @Override public String toString() { return get(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index cdab8bff..2cc4ed96 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -128,4 +128,19 @@ public interface Selectable { */ public Selectable jsonPath(String jsonPath); + /** + * Extracts content using a caller-supplied custom selector. + * + * @param selector the custom selector to apply + * @return the selection result as a Selectable + */ + public Selectable select(Selector selector); + + /** + * Extracts by custom selector, list variant (presumably all matches rather than one; TODO confirm). + * + * @param selector the custom selector to apply + * @return the selection result as a Selectable + */ + public Selectable selectList(Selector selector); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java index 3a97e1d3..6bfe88d7 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/ModelPageProcessor.java @@ -7,9 +7,7 @@ import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selector; import java.util.ArrayList; -import java.util.HashSet; import 
java.util.List; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -66,7 +64,7 @@ class ModelPageProcessor implements PageProcessor { if (urlRegionSelector == null) { links = page.getHtml().links().all(); } else { - links = urlRegionSelector.selectList(page.getHtml().toString()); + links = page.getHtml().selectList(urlRegionSelector).links().all(); } for (String link : links) { for (Pattern targetUrlPattern : urlPatterns) {