From 83926970b23856c3d23de69ae1d5d0b8ad7371cb Mon Sep 17 00:00:00 2001 From: Almark Ming Date: Tue, 17 Dec 2013 16:55:53 +0800 Subject: [PATCH 1/2] Check valid left parenthesis --- .../webmagic/selector/RegexSelector.java | 195 +++++++++--------- 1 file changed, 102 insertions(+), 93 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 6b1db967..d6975905 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -1,93 +1,102 @@ -package us.codecraft.webmagic.selector; - -import org.apache.commons.lang3.StringUtils; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -/** - * Selector in regex.
- * - * @author code4crafter@gmail.com
- * @since 0.1.0 - */ -public class RegexSelector implements Selector { - - private String regexStr; - - private Pattern regex; - - private int group = 1; - - public RegexSelector(String regexStr, int group) { - if (StringUtils.isBlank(regexStr)) { - throw new IllegalArgumentException("regex must not be empty"); - } - if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) { - regexStr = "(" + regexStr + ")"; - } - if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) { - throw new IllegalArgumentException("regex must have capture group 1"); - } - this.regexStr = regexStr; - try { - regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); - } catch (PatternSyntaxException e) { - throw new IllegalArgumentException("invalid regex", e); - } - this.group = group; - } - - public RegexSelector(String regexStr) { - this(regexStr, 1); - } - - @Override - public String select(String text) { - return selectGroup(text).get(group); - } - - @Override - public List selectList(String text) { - List strings = new ArrayList(); - List results = selectGroupList(text); - for (RegexResult result : results) { - strings.add(result.get(group)); - } - return strings; - } - - public RegexResult selectGroup(String text) { - Matcher matcher = regex.matcher(text); - if (matcher.find()) { - String[] groups = new String[matcher.groupCount() + 1]; - for (int i = 0; i < groups.length; i++) { - groups[i] = matcher.group(i); - } - return new RegexResult(groups); - } - return RegexResult.EMPTY_RESULT; - } - - public List selectGroupList(String text) { - Matcher matcher = regex.matcher(text); - List resultList = new ArrayList(); - while (matcher.find()) { - String[] groups = new String[matcher.groupCount() + 1]; - for (int i = 0; i < groups.length; i++) { - groups[i] = matcher.group(i); - } - resultList.add(new RegexResult(groups)); - } - return resultList; - } - - @Override - public String toString() { - return regexStr; - } - 
-} +package us.codecraft.webmagic.selector; + +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * Selector in regex.
+ * + * @author code4crafter@gmail.com
+ * @since 0.1.0 + */ +public class RegexSelector implements Selector { + + private String regexStr; + + private Pattern regex; + + private int group = 1; + + public RegexSelector(String regexStr, int group) { + if (StringUtils.isBlank(regexStr)) { + throw new IllegalArgumentException("regex must not be empty"); + } + /* Can't detect '\(', '(?:)' so that would be result in ArrayIndexOutOfBoundsException + if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) { + regexStr = "(" + regexStr + ")"; + } + if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) { + throw new IllegalArgumentException("regex must have capture group 1"); + } + */ + + // Try to fix: Only check if there exists the valid left parenthesis, leave regexp validation for Pattern + if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\\\\\(") == + StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\\\\\(?:")) { + regexStr = "(" + regexStr + ")"; + } + + this.regexStr = regexStr; + try { + regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException("invalid regex", e); + } + this.group = group; + } + + public RegexSelector(String regexStr) { + this(regexStr, 1); + } + + @Override + public String select(String text) { + return selectGroup(text).get(group); + } + + @Override + public List selectList(String text) { + List strings = new ArrayList(); + List results = selectGroupList(text); + for (RegexResult result : results) { + strings.add(result.get(group)); + } + return strings; + } + + public RegexResult selectGroup(String text) { + Matcher matcher = regex.matcher(text); + if (matcher.find()) { + String[] groups = new String[matcher.groupCount() + 1]; + for (int i = 0; i < groups.length; i++) { + groups[i] = matcher.group(i); + } + return new RegexResult(groups); + } + return 
RegexResult.EMPTY_RESULT; + } + + public List selectGroupList(String text) { + Matcher matcher = regex.matcher(text); + List resultList = new ArrayList(); + while (matcher.find()) { + String[] groups = new String[matcher.groupCount() + 1]; + for (int i = 0; i < groups.length; i++) { + groups[i] = matcher.group(i); + } + resultList.add(new RegexResult(groups)); + } + return resultList; + } + + @Override + public String toString() { + return regexStr; + } + +} \ No newline at end of file From 91ed66ecacab02dfd4b29d4cfd2fc5f2398fe241 Mon Sep 17 00:00:00 2001 From: Almark Ming Date: Tue, 17 Dec 2013 16:57:22 +0800 Subject: [PATCH 2/2] Fix grammar in RegexSelector comment --- .../main/java/us/codecraft/webmagic/selector/RegexSelector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index d6975905..b30bc18a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -26,7 +26,7 @@ public class RegexSelector implements Selector { if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } - /* Can't detect '\(', '(?:)' so that would be result in ArrayIndexOutOfBoundsException + /* Can't detect '\(', '(?:)' so that would result in ArrayIndexOutOfBoundsException if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) { regexStr = "(" + regexStr + ")"; }