diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index a4ea0d37..06987d86 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -1,5 +1,8 @@ package us.codecraft.webmagic.selector; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + import java.util.ArrayList; import java.util.List; @@ -11,12 +14,23 @@ import java.util.List; */ public class Html extends PlainText { + /** + * Store parsed document for better performance when only one text exist. + */ + private Document document; + public Html(List strings) { super(strings); } public Html(String text) { super(text); + this.document = Jsoup.parse(text); + } + + public Html(Document document) { + super(document.html()); + this.document = document; } public static Html create(String text) { @@ -53,38 +67,34 @@ public class Html extends PlainText { @Override public Selectable links() { - XsoupSelector xpathSelector = new XsoupSelector("//a/@href"); - return selectList(xpathSelector, strings); + return xpath("//a/@href"); } @Override public Selectable xpath(String xpath) { - XsoupSelector xpathSelector = new XsoupSelector(xpath); - return selectList(xpathSelector, strings); + XsoupSelector xsoupSelector = new XsoupSelector(xpath); + if (document!=null){ + return new Html(xsoupSelector.selectList(document)); + } + return selectList(xsoupSelector, strings); } @Override public Selectable $(String selector) { CssSelector cssSelector = Selectors.$(selector); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } @Override public Selectable $(String selector, String attrName) { CssSelector cssSelector = Selectors.$(selector, attrName); + if (document!=null){ + return new Html(cssSelector.selectList(document)); + } return selectList(cssSelector, strings); } - @Override - public Selectable text() { - TextContentSelector selector = Selectors.text(); - return select(selector, strings); - } - - @Override - public Selectable text(String newlineSeparator) { - TextContentSelector selector = Selectors.text(newlineSeparator); - return select(selector, strings); - } - } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index df6926dd..9406f3ab 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -89,7 +89,7 @@ public class PlainText implements Selectable { @Override public Selectable replace(String regex, String replacement) { - ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); + ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); return select(replaceSelector, strings); } @@ -107,18 +107,6 @@ public class PlainText implements Selectable { } } - @Override - public Selectable text() { - //do nothing - return this; - } - - @Override - public Selectable text(String newlineSeparator) { - //do nothing - return this; - } - @Override public boolean match() { return strings != null && strings.size() > 0; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 398906fa..66df5d5b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -82,20 +82,6 @@ public interface Selectable { */ public String toString(); - /** - * select text content of html - * - * @return text - */ - public Selectable text(); - - /** - * select text content of html - * - * @return text - */ - public Selectable text(String newlineSeparator); - /** * if result exist for select * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java deleted file mode 100644 index 8a0c76c9..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.apache.commons.lang3.StringUtils; - -import java.lang.reflect.Constructor; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Selector factory with some inner cache.
- * - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class SelectorFactory { - - private Map innerCache = new ConcurrentHashMap(); - - private static final SelectorFactory INSTATNCE = new SelectorFactory(); - - public static SelectorFactory getInstatnce() { - return INSTATNCE; - } - - public RegexSelector newRegexSelector(String regex) { - return newSelector(RegexSelector.class, regex); - } - - public RegexSelector newRegexSelector(String regex, int group) { - String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group)); - if (innerCache.get(cacheKey) != null) { - return (RegexSelector) innerCache.get(cacheKey); - } - return new RegexSelector(regex, group); - } - - public ReplaceSelector newReplaceSelector(String regex, String replacement) { - return newSelector(ReplaceSelector.class, regex, replacement); - } - - public XpathSelector newXpathSelector(String xpath) { - return newSelector(XpathSelector.class, xpath); - } - - public SmartContentSelector newSmartContentSelector() { - return newSelector(SmartContentSelector.class); - } - - public T newAndCacheSelector(Class clazz, String... param) { - String cacheKey = getCacheKey(RegexSelector.class, param); - if (innerCache.get(cacheKey) != null) { - return (T) innerCache.get(cacheKey); - } - T selector = newSelector(clazz, param); - if (selector != null) { - innerCache.put(cacheKey, selector); - } - return selector; - - } - - public T newSelector(Class clazz, String... param) { - try { - if (param.length == 0) { - Constructor constructor - = clazz.getConstructor(); - T selector = constructor.newInstance(); - return selector; - } else if (param.length == 1) { - Constructor constructor - = clazz.getConstructor(String.class); - T selector = constructor.newInstance(param[0]); - return selector; - } else if (param.length == 2) { - Constructor constructor - = clazz.getConstructor(String.class, String.class); - T selector = constructor.newInstance(param[0], param[1]); - return selector; - } else { - throw new UnsupportedOperationException(); - } - } catch (Exception e) { - throw new IllegalArgumentException("init object error", e); - } - } - - private String getCacheKey(Class clazz, String... param) { - return clazz.toString() + "_" + StringUtils.join(param, "_"); - } - -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 051d6a43..9764641c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -9,15 +9,15 @@ package us.codecraft.webmagic.selector; public abstract class Selectors { public static RegexSelector regex(String expr) { - return SelectorFactory.getInstatnce().newRegexSelector(expr); + return new RegexSelector(expr); } public static RegexSelector regex(String expr, int group) { - return SelectorFactory.getInstatnce().newRegexSelector(expr, group); + return new RegexSelector(expr,group); } public static SmartContentSelector smartContent() { - return SelectorFactory.getInstatnce().newSmartContentSelector(); + return new SmartContentSelector(); } public static CssSelector $(String expr) { @@ -29,7 +29,11 @@ public abstract class Selectors { } public static XpathSelector xpath(String expr) { - return SelectorFactory.getInstatnce().newXpathSelector(expr); + return new XpathSelector(expr); + } + + public static XsoupSelector xsoup(String expr) { + return new XsoupSelector(expr); } public static AndSelector and(Selector... selectors) { @@ -40,14 +44,6 @@ public abstract class Selectors { return new OrSelector(selectors); } - public static TextContentSelector text() { - return new TextContentSelector(); - } - - public static TextContentSelector text(String newlineSeperator) { - return new TextContentSelector(newlineSeperator); - } - public static void main(String[] args) { String s = "a"; or(regex("(.*)"), xpath("//title"), $("title")).select(s); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java deleted file mode 100644 index 54e82042..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/TextContentSelector.java +++ /dev/null @@ -1,68 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Extract text content in html.
- * Algorithm from http://www.elias.cn/En/ExtMainText.
- * - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelector implements Selector { - - private String newLineSeperator = "\n"; - - public TextContentSelector() { - } - - public TextContentSelector(String newLineSeperator) { - this.newLineSeperator = newLineSeperator; - } - - private final static Set TAGS_IN_NEWLINE = new HashSet(); - - private final static Set TAGS_TO_IGNORE = new HashSet(); - - static { - TAGS_IN_NEWLINE.addAll(Arrays.asList(new String[]{"p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "br", "li"})); - TAGS_TO_IGNORE.addAll(Arrays.asList(new String[]{"head", "style", "script", "noscript", "option"})); - } - - @Override - public String select(String text) { - Document doc = Jsoup.parse(text); - return select0(doc); - } - - protected String select0(Element element) { - String tagName = element.tagName().toLowerCase(); - if (TAGS_TO_IGNORE.contains(tagName)) { - return ""; - } - StringBuilder textBuilder = new StringBuilder(); - textBuilder.append(element.text()); - if (element.children() != null) { - for (Element child : element.children()) { - textBuilder.append(select0(child)); - } - } - if (TAGS_IN_NEWLINE.contains(tagName)) { - textBuilder.append(newLineSeperator); - } - return textBuilder.toString(); - } - - @Override - public List selectList(String text) { - throw new UnsupportedOperationException(); - } - -} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java deleted file mode 100644 index f5018249..00000000 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/TextContentSelectorTest.java +++ /dev/null @@ -1,34 +0,0 @@ -package us.codecraft.webmagic.selector; - -import junit.framework.Assert; -import org.junit.Ignore; -import org.junit.Test; -import us.codecraft.webmagic.downloader.HttpClientDownloader; - -/** - * @author code4crafter@gmail.com
- * @since 0.2.2 - */ -public class TextContentSelectorTest { - - @Test - public void test() { - String html = "
\n" + - "
\n" + - "

Add more powerful selector for content text extract refered to http://www.elias.cn/En/ExtMainText

\n" + - "
\n" + - "
"; - TextContentSelector textContentSelector = new TextContentSelector("
"); - String text = textContentSelector.select(html); - Assert.assertNotNull(text); - } - - @Ignore("takes long time") - @Test - public void testDownload() { - String s = new HttpClientDownloader().download("http://blog.codecraft.us/blog/2013/08/18/ti-yan-dao-liao-open-sourcede-mei-li/", "utf-8") - .smartContent().text().toString(); - Assert.assertNotNull(s); - } - -}