From 17f8ead28fa65b402f9bb7c0ae76d1c514076a5b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sat, 17 Aug 2013 21:33:54 +0800 Subject: [PATCH] update comments for selector --- .../us/codecraft/webmagic/selector/Html.java | 8 +++---- .../webmagic/selector/RegexResult.java | 6 ++--- .../webmagic/selector/RegexSelector.java | 22 +++++++++---------- .../webmagic/selector/ReplaceSelector.java | 6 ++--- .../webmagic/selector/Selectable.java | 6 ++--- .../codecraft/webmagic/selector/Selector.java | 15 ++++++++++--- .../webmagic/selector/SelectorFactory.java | 8 +++---- .../selector/SmartContentSelector.java | 10 +++++---- .../webmagic/selector/XpathSelector.java | 6 ++--- .../codecraft/webmagic/selector/package.html | 2 +- 10 files changed, 50 insertions(+), 39 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 114eef99..5a0c6cde 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -4,10 +4,10 @@ import java.util.ArrayList; import java.util.List; /** - * 可抽取的html文本。
+ * Selectable html text.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:54 + * @since 0.1.0 */ public class Html extends PlainText { @@ -66,7 +66,7 @@ public class Html extends PlainText { @Override public Selectable $(String selector) { CssSelector cssSelector = new CssSelector(selector); - return selectList(cssSelector,strings); + return selectList(cssSelector, strings); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java index 04467bcc..5876576b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java @@ -1,10 +1,10 @@ package us.codecraft.webmagic.selector; /** - * 封装正则表达式抽取接口的类。
+ * Object containing regex results.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:39 + * @since 0.1.0 */ class RegexResult { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index e95138b7..03967f31 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -9,10 +9,10 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** - * 正则表达式抽取器。
+ * Selector in regex.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:09 + * @since 0.1.0 */ public class RegexSelector implements Selector { @@ -21,18 +21,18 @@ public class RegexSelector implements Selector { private Pattern regex; public RegexSelector(String regexStr) { - if (StringUtils.isBlank(regexStr)){ + if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } - if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){ - regexStr="("+regexStr+")"; + if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) { + regexStr = "(" + regexStr + ")"; } - if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){ + if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) { throw new IllegalArgumentException("regex must have capture group 1"); } this.regexStr = regexStr; try { - regex = Pattern.compile(regexStr,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); + regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("invalid regex", e); } @@ -45,7 +45,7 @@ public class RegexSelector implements Selector { @Override public List selectList(String text) { - List strings=new ArrayList(); + List strings = new ArrayList(); List results = selectGroupList(text); for (RegexResult result : results) { strings.add(result.get(1)); @@ -56,7 +56,7 @@ public class RegexSelector implements Selector { public RegexResult selectGroup(String text) { Matcher matcher = regex.matcher(text); if (matcher.find()) { - String[] groups = new String[matcher.groupCount()+1]; + String[] groups = new String[matcher.groupCount() + 1]; for (int i = 0; i < groups.length; i++) { groups[i] = matcher.group(i); } @@ -69,7 +69,7 @@ public class RegexSelector implements Selector { Matcher matcher = regex.matcher(text); List resultList = new ArrayList(); while (matcher.find()) { - String[] groups = new 
String[matcher.groupCount()+1]; + String[] groups = new String[matcher.groupCount() + 1]; for (int i = 0; i < groups.length; i++) { groups[i] = matcher.group(i); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java index 38b95f78..567dcda3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java @@ -6,10 +6,10 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; /** - * 对文本进行替换。
+ * Replace selector.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:09 + * @since 0.1.0 */ public class ReplaceSelector implements Selector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 42f3d108..65878ece 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector; import java.util.List; /** - * 可进行抽取的文本。
+ * Selectable text.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-20 - * Time: 下午7:51 + * @since 0.1.0 */ public interface Selectable { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java index 4af2b449..06756c44 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java @@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector; import java.util.List; /** - * 抽取器。
+ * Selector (extractor) for text.
* @author code4crafter@gmail.com
- * Date: 13-4-20 - * Time: 下午8:02 */ public interface Selector { + /** + * Extract single result in text.
+ * If there is more than one result, only the first will be chosen. + * @param text + * @return result + */ public String select(String text); + /** + * Extract all results in text.
+ * @param text + * @return results + */ public List selectList(String text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java index 1dd56e01..c8a3879c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java @@ -7,10 +7,10 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; /** - * 产生selector的工厂。
+ * Selector factory with some inner cache.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午7:56 + * @since 0.1.0 */ public class SelectorFactory { @@ -34,7 +34,7 @@ public class SelectorFactory { return newSelector(XpathSelector.class, xpath); } - public SmartContentSelector newSmartContentSelector(){ + public SmartContentSelector newSmartContentSelector() { return newSelector(SmartContentSelector.class); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index 89748975..efd4e11a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector; import org.apache.log4j.Logger; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; +import us.codecraft.webmagic.utils.Experimental; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; /** - * readability算法,基础是找到所有p标签的父节点 - * 写的比较乱,最终效果还在尝试中 + * Extract the text content of html.
+ * Using Readability algorithm: find parents of all p tags. + * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 下午4:42 + * @since 0.1.0 */ +@Experimental public class SmartContentSelector implements Selector { private Logger logger = Logger.getLogger(getClass()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java index 02afe291..a2eab3d9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java @@ -6,10 +6,10 @@ import java.util.ArrayList; import java.util.List; /** - * xpath的选择器。包装了HtmlCleaner。
+ * XPath selector based on HtmlCleaner.
+ * * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * @since 0.1.0 */ public class XpathSelector implements Selector { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html index 3c9ef7b2..eb01c3f7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html @@ -1,5 +1,5 @@ -提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制。 +Selectors for page extraction. Core API is the interface Selectable, and internal core is the interface Selector.