diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java index 584cf900..1af6395c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java @@ -26,14 +26,16 @@ public class RegexSelector implements Selector { if (StringUtils.isBlank(regexStr)) { throw new IllegalArgumentException("regex must not be empty"); } - // Check bracket for regex group. Add default group 1 if there is no group. - // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. - if ( ! hasGroup(regexStr) ){ - regexStr = "(" + regexStr + ")"; - } - this.regexStr = regexStr; + try { regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + // Add default group 1 if the compiled pattern has no capturing group. + // Group detection relies on Matcher.groupCount() of the compiled Pattern instead of counting parentheses in the raw string. 
+ if ( regex.matcher("").groupCount() == 0 ){ + regexStr = "(" + regexStr + ")"; + regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); + } + this.regexStr = regexStr; } catch (PatternSyntaxException e) { throw new IllegalArgumentException("invalid regex", e); } @@ -44,30 +46,6 @@ public class RegexSelector implements Selector { this(regexStr, 1); } - private boolean hasGroup(String regexStr) { - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")){ - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?=") - StringUtils.countMatches(regexStr, "\\(?=") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?<") - StringUtils.countMatches(regexStr, "\\(?<") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?!") - StringUtils.countMatches(regexStr, "\\(?!") ) { - return false; - } - if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == - StringUtils.countMatches(regexStr, "(?#") - StringUtils.countMatches(regexStr, "\\(?#") ) { - return false; - } - return true; - } - @Override public String select(String text) { return selectGroup(text).get(group); diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java index 144e6fe2..871caa14 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java @@ -25,8 +25,8 @@ public class RegexSelectorTest { 
@Test public void testRegexWithZeroWidthAssertions() { - String regex = "^.*(?=\\?)"; - String source = "hello world?xxxx"; + String regex = "^.*(?=\\?)(?!\\?yy)"; + String source = "hello world?xx?yy"; RegexSelector regexSelector = new RegexSelector(regex); String select = regexSelector.select(source); Assertions.assertThat(select).isEqualTo("hello world");