diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index 114eef99..5a0c6cde 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -4,10 +4,10 @@ import java.util.ArrayList;
import java.util.List;
/**
- * 可抽取的html文本。
+ * Selectable html.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:54
+ * @since 0.1.0
*/
public class Html extends PlainText {
@@ -66,7 +66,7 @@ public class Html extends PlainText {
@Override
public Selectable $(String selector) {
CssSelector cssSelector = new CssSelector(selector);
- return selectList(cssSelector,strings);
+ return selectList(cssSelector, strings);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
index 04467bcc..5876576b 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java
@@ -1,10 +1,10 @@
package us.codecraft.webmagic.selector;
/**
- * 封装正则表达式抽取接口的类。
+ * Object that contains regex results.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:39
+ * @since 0.1.0
*/
class RegexResult {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
index e95138b7..03967f31 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
@@ -9,10 +9,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
- * 正则表达式抽取器。
+ * Selector based on regular expressions.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:09
+ * @since 0.1.0
*/
public class RegexSelector implements Selector {
@@ -21,18 +21,18 @@ public class RegexSelector implements Selector {
private Pattern regex;
public RegexSelector(String regexStr) {
- if (StringUtils.isBlank(regexStr)){
+ if (StringUtils.isBlank(regexStr)) {
throw new IllegalArgumentException("regex must not be empty");
}
- if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){
- regexStr="("+regexStr+")";
+ if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
+ regexStr = "(" + regexStr + ")";
}
- if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){
+ if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
throw new IllegalArgumentException("regex must have capture group 1");
}
this.regexStr = regexStr;
try {
- regex = Pattern.compile(regexStr,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
+ regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("invalid regex", e);
}
@@ -45,7 +45,7 @@ public class RegexSelector implements Selector {
@Override
public List selectList(String text) {
- List strings=new ArrayList();
+ List strings = new ArrayList();
List results = selectGroupList(text);
for (RegexResult result : results) {
strings.add(result.get(1));
@@ -56,7 +56,7 @@ public class RegexSelector implements Selector {
public RegexResult selectGroup(String text) {
Matcher matcher = regex.matcher(text);
if (matcher.find()) {
- String[] groups = new String[matcher.groupCount()+1];
+ String[] groups = new String[matcher.groupCount() + 1];
for (int i = 0; i < groups.length; i++) {
groups[i] = matcher.group(i);
}
@@ -69,7 +69,7 @@ public class RegexSelector implements Selector {
Matcher matcher = regex.matcher(text);
List resultList = new ArrayList();
while (matcher.find()) {
- String[] groups = new String[matcher.groupCount()+1];
+ String[] groups = new String[matcher.groupCount() + 1];
for (int i = 0; i < groups.length; i++) {
groups[i] = matcher.group(i);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
index 38b95f78..567dcda3 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java
@@ -6,10 +6,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
- * 对文本进行替换。
+ * Replace selector.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:09
+ * @since 0.1.0
*/
public class ReplaceSelector implements Selector {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
index 42f3d108..65878ece 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java
@@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
import java.util.List;
/**
- * 可进行抽取的文本。
+ * Selectable text.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-20
- * Time: 下午7:51
+ * @since 0.1.0
*/
public interface Selectable {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
index 4af2b449..06756c44 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java
@@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
import java.util.List;
/**
- * 抽取器。
+ * Selector(extractor) for text.
* @author code4crafter@gmail.com
- * Date: 13-4-20
- * Time: 下午8:02
*/
public interface Selector {
+ /**
+ * Extract single result in text.
+ * If there is more than one result, only the first will be chosen.
+ * @param text the text to extract from
+ * @return result
+ */
public String select(String text);
+ /**
+ * Extract all results in text.
+ * @param text the text to extract from
+ * @return results
+ */
public List selectList(String text);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
index 1dd56e01..c8a3879c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SelectorFactory.java
@@ -7,10 +7,10 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
- * 产生selector的工厂。
+ * Selector factory with some inner cache.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午7:56
+ * @since 0.1.0
*/
public class SelectorFactory {
@@ -34,7 +34,7 @@ public class SelectorFactory {
return newSelector(XpathSelector.class, xpath);
}
- public SmartContentSelector newSmartContentSelector(){
+ public SmartContentSelector newSmartContentSelector() {
return newSelector(SmartContentSelector.class);
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
index 89748975..efd4e11a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java
@@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
+import us.codecraft.webmagic.utils.Experimental;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
- * readability算法,基础是找到所有p标签的父节点
- * 写的比较乱,最终效果还在尝试中
+ * Extract the text content of html.
+ * Using Readability algorithm: find parents of all p tags.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 下午4:42
+ * @since 0.1.0
*/
+@Experimental
public class SmartContentSelector implements Selector {
private Logger logger = Logger.getLogger(getClass());
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
index 02afe291..a2eab3d9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java
@@ -6,10 +6,10 @@ import java.util.ArrayList;
import java.util.List;
/**
- * xpath的选择器。包装了HtmlCleaner。
+ * XPath selector based on HtmlCleaner.
+ *
* @author code4crafter@gmail.com
- * Date: 13-4-21
- * Time: 上午9:39
+ * @since 0.1.0
*/
public class XpathSelector implements Selector {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html
index 3c9ef7b2..eb01c3f7 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html
@@ -1,5 +1,5 @@
-提供了便捷抽取页面内容的工具,对外核心接口是Selectable,内部抽取则是通过实现Selector来定制。
+Selectors for page extraction. The core public API is the interface Selectable, and internal extraction is customized by implementing the interface Selector.