From 791520e6a0a0315f24c6e5030795658c38a0c125 Mon Sep 17 00:00:00 2001
From: mei
Date: Fri, 17 Mar 2017 00:06:15 +0800
Subject: [PATCH] fix a bug of RegexSelector when regex has zero-width assertions.

RegexSelector wraps the regex in a default group 1 when the regex declares
no capturing group. The old check only knew about "(?:", so a regex made of
zero-width assertions was taken for one that already has a group.

Detect capturing groups with a single scan instead of comparing substring
counts: an unescaped "(" captures unless it is followed by "?", and among
the "(?" constructs only named groups "(?<name>...)" capture. This also
covers regexes that mix several non-capturing constructs, atomic groups,
inline flags, and an escaped backslash placed right before a group.

---
Review notes: the line structure of this patch was restored from a copy
whose newlines had been collapsed; leading whitespace on context lines
assumes the project's 4-space indentation. The post-image blob ids on the
"index" lines predate the reworked hasGroup and are informational only.

 .../webmagic/selector/RegexSelector.java      | 40 +++++++++++++++++--
 .../webmagic/selector/RegexSelectorTest.java  | 28 +++++++++++++
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
index 43818965..584cf900 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
@@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
         }
         // Check bracket for regex group. Add default group 1 if there is no group.
         // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
-        if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
-                StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
+        if (!hasGroup(regexStr)) {
             regexStr = "(" + regexStr + ")";
         }
         this.regexStr = regexStr;
@@ -45,6 +44,43 @@ public class RegexSelector implements Selector {
         this(regexStr, 1);
     }
 
+    /**
+     * Tells whether the regex declares at least one capturing group.
+     * <p>
+     * An unescaped {@code (} captures unless it is followed by {@code ?}.
+     * Among the {@code (?} constructs only named groups, {@code (?<name>...)},
+     * capture; {@code (?:}, {@code (?=}, {@code (?!}, {@code (?<=}, {@code (?<!},
+     * {@code (?>} and inline flags do not. Parentheses inside character classes
+     * or {@code \Q...\E} quoting are not treated specially; regexp validation
+     * is left to Pattern.
+     *
+     * @param regexStr the regex to inspect
+     * @return true if a capturing group is found
+     */
+    private static boolean hasGroup(String regexStr) {
+        int length = regexStr.length();
+        for (int i = 0; i < length; i++) {
+            char c = regexStr.charAt(i);
+            if (c == '\\') {
+                // Skip the escaped character so that an escaped paren is not counted.
+                i++;
+                continue;
+            }
+            if (c != '(') {
+                continue;
+            }
+            if (i + 1 >= length || regexStr.charAt(i + 1) != '?') {
+                return true;
+            }
+            // "(?<" opens a named capturing group unless it is a lookbehind.
+            if (i + 3 < length && regexStr.charAt(i + 2) == '<'
+                    && regexStr.charAt(i + 3) != '=' && regexStr.charAt(i + 3) != '!') {
+                return true;
+            }
+        }
+        return false;
+    }
+
     @Override
     public String select(String text) {
         return selectGroup(text).get(group);
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
index 63e8e43b..144e6fe2 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
@@ -22,4 +22,32 @@ public class RegexSelectorTest {
         String select = regexSelector.select(source);
         Assertions.assertThat(select).isEqualTo(source);
     }
+
+    @Test
+    public void testRegexWithZeroWidthAssertions() {
+        String regex = "^.*(?=\\?)";
+        String source = "hello world?xxxx";
+        RegexSelector regexSelector = new RegexSelector(regex);
+        String select = regexSelector.select(source);
+        Assertions.assertThat(select).isEqualTo("hello world");
+
+
+        regex = "\\d{3}(?!\\d)";
+        source = "123456asdf";
+        regexSelector = new RegexSelector(regex);
+        select = regexSelector.select(source);
+        Assertions.assertThat(select).isEqualTo("456");
+    }
+
+    @Test
+    public void testRegexWithMixedNonCapturingConstructs() {
+        RegexSelector regexSelector = new RegexSelector("(?<=a)\\d+(?=b)");
+        Assertions.assertThat(regexSelector.select("a123b")).isEqualTo("123");
+    }
+
+    @Test
+    public void testRegexWithNamedGroup() {
+        RegexSelector regexSelector = new RegexSelector("a(?<num>\\d+)b");
+        Assertions.assertThat(regexSelector.select("a123b")).isEqualTo("123");
+    }
 }