update comments for selector

pull/17/head
yihua.huang 12 years ago
parent 77e6ca2945
commit 17f8ead28f

@ -4,10 +4,10 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* html<br> * Selectable plain text.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 7:54
*/ */
public class Html extends PlainText { public class Html extends PlainText {
@ -66,7 +66,7 @@ public class Html extends PlainText {
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
CssSelector cssSelector = new CssSelector(selector); CssSelector cssSelector = new CssSelector(selector);
return selectList(cssSelector,strings); return selectList(cssSelector, strings);
} }
} }

@ -1,10 +1,10 @@
package us.codecraft.webmagic.selector; package us.codecraft.webmagic.selector;
/** /**
* <br> * Object contains regex results.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 7:39
*/ */
class RegexResult { class RegexResult {

@ -9,10 +9,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
/** /**
* <br> * Selector in regex.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 7:09
*/ */
public class RegexSelector implements Selector { public class RegexSelector implements Selector {
@ -21,18 +21,18 @@ public class RegexSelector implements Selector {
private Pattern regex; private Pattern regex;
public RegexSelector(String regexStr) { public RegexSelector(String regexStr) {
if (StringUtils.isBlank(regexStr)){ if (StringUtils.isBlank(regexStr)) {
throw new IllegalArgumentException("regex must not be empty"); throw new IllegalArgumentException("regex must not be empty");
} }
if (!StringUtils.contains(regexStr,"(")&&!StringUtils.contains(regexStr,")")){ if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
regexStr="("+regexStr+")"; regexStr = "(" + regexStr + ")";
} }
if (!StringUtils.contains(regexStr,"(")||!StringUtils.contains(regexStr,")")){ if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
throw new IllegalArgumentException("regex must have capture group 1"); throw new IllegalArgumentException("regex must have capture group 1");
} }
this.regexStr = regexStr; this.regexStr = regexStr;
try { try {
regex = Pattern.compile(regexStr,Pattern.DOTALL|Pattern.CASE_INSENSITIVE); regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { } catch (PatternSyntaxException e) {
throw new IllegalArgumentException("invalid regex", e); throw new IllegalArgumentException("invalid regex", e);
} }
@ -45,7 +45,7 @@ public class RegexSelector implements Selector {
@Override @Override
public List<String> selectList(String text) { public List<String> selectList(String text) {
List<String> strings=new ArrayList<String>(); List<String> strings = new ArrayList<String>();
List<RegexResult> results = selectGroupList(text); List<RegexResult> results = selectGroupList(text);
for (RegexResult result : results) { for (RegexResult result : results) {
strings.add(result.get(1)); strings.add(result.get(1));
@ -56,7 +56,7 @@ public class RegexSelector implements Selector {
public RegexResult selectGroup(String text) { public RegexResult selectGroup(String text) {
Matcher matcher = regex.matcher(text); Matcher matcher = regex.matcher(text);
if (matcher.find()) { if (matcher.find()) {
String[] groups = new String[matcher.groupCount()+1]; String[] groups = new String[matcher.groupCount() + 1];
for (int i = 0; i < groups.length; i++) { for (int i = 0; i < groups.length; i++) {
groups[i] = matcher.group(i); groups[i] = matcher.group(i);
} }
@ -69,7 +69,7 @@ public class RegexSelector implements Selector {
Matcher matcher = regex.matcher(text); Matcher matcher = regex.matcher(text);
List<RegexResult> resultList = new ArrayList<RegexResult>(); List<RegexResult> resultList = new ArrayList<RegexResult>();
while (matcher.find()) { while (matcher.find()) {
String[] groups = new String[matcher.groupCount()+1]; String[] groups = new String[matcher.groupCount() + 1];
for (int i = 0; i < groups.length; i++) { for (int i = 0; i < groups.length; i++) {
groups[i] = matcher.group(i); groups[i] = matcher.group(i);
} }

@ -6,10 +6,10 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
/** /**
* <br> * Replace selector<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 7:09
*/ */
public class ReplaceSelector implements Selector { public class ReplaceSelector implements Selector {

@ -3,10 +3,10 @@ package us.codecraft.webmagic.selector;
import java.util.List; import java.util.List;
/** /**
* <br> * Selectable text.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-20 * @since 0.1.0
* Time: 7:51
*/ */
public interface Selectable { public interface Selectable {

@ -3,15 +3,24 @@ package us.codecraft.webmagic.selector;
import java.util.List; import java.util.List;
/** /**
* <br> * Selector (extractor) for text.<br>
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-20
* Time: 8:02
*/ */
public interface Selector { public interface Selector {
/**
* Extract single result in text.<br>
* If there is more than one result, only the first will be chosen.
* @param text
* @return result
*/
public String select(String text); public String select(String text);
/**
* Extract all results in text.<br>
* @param text
* @return results
*/
public List<String> selectList(String text); public List<String> selectList(String text);
} }

@ -7,10 +7,10 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
/** /**
* selector<br> * Selector factory with some inner cache.<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 7:56
*/ */
public class SelectorFactory { public class SelectorFactory {
@ -34,7 +34,7 @@ public class SelectorFactory {
return newSelector(XpathSelector.class, xpath); return newSelector(XpathSelector.class, xpath);
} }
public SmartContentSelector newSmartContentSelector(){ public SmartContentSelector newSmartContentSelector() {
return newSelector(SmartContentSelector.class); return newSelector(SmartContentSelector.class);
} }

@ -3,17 +3,19 @@ package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode; import org.htmlcleaner.TagNode;
import us.codecraft.webmagic.utils.Experimental;
import java.util.*; import java.util.*;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
/** /**
* readabilityp * Extract the text content of html.<br>
* * Using Readability algorithm: find parents of all p tags.
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 4:42
*/ */
@Experimental
public class SmartContentSelector implements Selector { public class SmartContentSelector implements Selector {
private Logger logger = Logger.getLogger(getClass()); private Logger logger = Logger.getLogger(getClass());

@ -6,10 +6,10 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
/** /**
* xpathHtmlCleaner<br> * XPath selector based on HtmlCleaner<br>
*
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* Date: 13-4-21 * @since 0.1.0
* Time: 9:39
*/ */
public class XpathSelector implements Selector { public class XpathSelector implements Selector {

@ -1,5 +1,5 @@
<html> <html>
<body> <body>
提供了便捷抽取页面内容的工具。对外核心接口是Selectable，内部抽取则是通过实现Selector来定制。 Selectors for page extraction. Core API is the interface Selectable, and internal core is the interface Selector.
</body> </body>
</html> </html>

Loading…
Cancel
Save