From 2c0e1494ca0e48acdb1c221aedc42b25db576874 Mon Sep 17 00:00:00 2001 From: Sutra Zhou Date: Sat, 30 Mar 2024 14:37:24 +0800 Subject: [PATCH] Revert "Refactored code for increased optimization. (#1139)" This reverts commit f051d978e2f329de8f30455e6ab658789e328f1c. --- .../main/java/us/codecraft/webmagic/Page.java | 19 ++---- .../downloader/HttpClientGenerator.java | 3 +- .../webmagic/model/HttpRequestBody.java | 2 +- .../webmagic/selector/ElementsUtil.java | 53 ---------------- .../codecraft/webmagic/selector/HtmlNode.java | 63 ++++++++++++++++--- .../webmagic/configurable/ExtractRule.java | 27 ++++---- .../configurable/SelectorFactory.java | 57 ----------------- 7 files changed, 74 insertions(+), 150 deletions(-) delete mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java delete mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index dc87ece8..b4c161a9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -169,25 +169,18 @@ public class Page { * @param priority Priority for the URL */ private void addRequestIfValid(String url, long priority) { - boolean isBlankUrl = StringUtils.isBlank(url); - boolean isHashSymbol = url.equals("#"); - boolean isJavaScript = url.startsWith("javascript:"); - - if (isBlankUrl || isHashSymbol || isJavaScript) { - return; // Invalid URL, so no further processing is needed. + if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { + return; } String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); - Request request = new Request(canonicalizedUrl); - - if (priority > 0) { - request.setPriority(priority); + Request req = new Request(canonicalizedUrl); + if(priority > 0) { + req.setPriority(priority); } - - targetRequests.add(request); + targetRequests.add(req); } - /** * add url to fetch * diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index f32a4eba..167a5e1c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -40,14 +40,13 @@ public class HttpClientGenerator { private PoolingHttpClientConnectionManager connectionManager; - private static final int DEFAULT_MAX_PER_ROUTE = 100; public HttpClientGenerator() { Registry reg = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", buildSSLConnectionSocketFactory()) .build(); connectionManager = new PoolingHttpClientConnectionManager(reg); - connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE); + connectionManager.setDefaultMaxPerRoute(100); } private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java index 23606d86..7d3b3078 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java @@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable { this.encoding = encoding; } - public static HttpRequestBody createJsonRequestBody(String json, String encoding) { + public static HttpRequestBody json(String json, String encoding) { try { return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); } catch (UnsupportedEncodingException e) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java deleted file mode 100644 index 10873c71..00000000 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java +++ /dev/null @@ -1,53 +0,0 @@ -package us.codecraft.webmagic.selector; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; - -import java.util.ArrayList; -import java.util.List; -import java.util.ListIterator; - -public class ElementsUtil { - HtmlNode htmlNode = new HtmlNode(); - public Selectable selectElements(BaseElementSelector elementSelector) { - ListIterator elementIterator = htmlNode.getElements().listIterator(); - if (!elementSelector.hasAttribute()) { - List resultElements = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectElements = elementSelector.selectElements(element); - resultElements.addAll(selectElements); - } - return new HtmlNode(resultElements); - } else { - // has attribute, consider as plaintext - List resultStrings = new ArrayList(); - while (elementIterator.hasNext()) { - Element element = checkElementAndConvert(elementIterator); - List selectList = elementSelector.selectList(element); - resultStrings.addAll(selectList); - } - return new PlainText(resultStrings); - - } - } - - /** - * Only document can be select - * See: https://github.com/code4craft/webmagic/issues/113 - * - * @param elementIterator elementIterator - * @return element element - */ - public Element checkElementAndConvert(ListIterator elementIterator) { - Element element = elementIterator.next(); - if (!(element instanceof Document)) { - Document root = new Document(element.ownerDocument().baseUri()); - Element clone = element.clone(); - root.appendChild(clone); - elementIterator.set(root); - return root; - } - return element; - } -} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 32a8b976..85ff5fa6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -33,22 +33,19 @@ public class HtmlNode extends AbstractSelectable { @Override public Selectable links() { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements(new LinksSelector()); + return selectElements(new LinksSelector()); } @Override public Selectable xpath(String xpath) { - ElementsUtil elementsUtil = new ElementsUtil(); XpathSelector xpathSelector = Selectors.xpath(xpath); - return elementsUtil.selectElements(xpathSelector); + return selectElements(xpathSelector); } @Override public Selectable selectList(Selector selector) { if (selector instanceof BaseElementSelector) { - ElementsUtil elementsUtil = new ElementsUtil(); - return elementsUtil.selectElements((BaseElementSelector) selector); + return selectElements((BaseElementSelector) selector); } return selectList(selector, getSourceTexts()); } @@ -58,18 +55,64 @@ public class HtmlNode extends AbstractSelectable { return selectList(selector); } + /** + * select elements + * + * @param elementSelector elementSelector + * @return result + */ + protected Selectable selectElements(BaseElementSelector elementSelector) { + ListIterator elementIterator = getElements().listIterator(); + if (!elementSelector.hasAttribute()) { + List resultElements = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectElements = elementSelector.selectElements(element); + resultElements.addAll(selectElements); + } + return new HtmlNode(resultElements); + } else { + // has attribute, consider as plaintext + List resultStrings = new ArrayList(); + while (elementIterator.hasNext()) { + Element element = checkElementAndConvert(elementIterator); + List selectList = elementSelector.selectList(element); + resultStrings.addAll(selectList); + } + return new PlainText(resultStrings); + + } + } + + /** + * Only document can be select + * See: https://github.com/code4craft/webmagic/issues/113 + * + * @param elementIterator elementIterator + * @return element element + */ + private Element checkElementAndConvert(ListIterator elementIterator) { + Element element = elementIterator.next(); + if (!(element instanceof Document)) { + Document root = new Document(element.ownerDocument().baseUri()); + Element clone = element.clone(); + root.appendChild(clone); + elementIterator.set(root); + return root; + } + return element; + } + @Override public Selectable $(String selector) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override public Selectable $(String selector, String attrName) { - ElementsUtil elementsUtil = new ElementsUtil(); CssSelector cssSelector = Selectors.$(selector, attrName); - return elementsUtil.selectElements(cssSelector); + return selectElements(cssSelector); } @Override diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java index 5596cfc7..bbc48dda 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java @@ -76,27 +76,26 @@ public class ExtractRule { } private Selector compileSelector() { - SelectorFactory factory; switch (expressionType) { case Css: - factory = new CssSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return $(expressionValue, expressionParams[0]); + } else { + return $(expressionValue); + } case XPath: - factory = new XPathSelectorFactory(); - break; + return xpath(expressionValue); case Regex: - factory = new RegexSelectorFactory(); - break; + if (expressionParams.length >= 1) { + return regex(expressionValue, Integer.parseInt(expressionParams[0])); + } else { + return regex(expressionValue); + } case JsonPath: - factory = new JsonPathSelectorFactory(); - break; + return new JsonPathSelector(expressionValue); default: - factory = new XPathSelectorFactory(); // Default to XPath + return xpath(expressionValue); } - - SelectorCompiler selectorCompiler = new SelectorCompiler(factory); - Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams); - return compiledSelector; } public void setSelector(Selector selector) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java deleted file mode 100644 index 7bca4ba7..00000000 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java +++ /dev/null @@ -1,57 +0,0 @@ -package us.codecraft.webmagic.configurable; - -import us.codecraft.webmagic.selector.JsonPathSelector; -import us.codecraft.webmagic.selector.Selector; - -import static us.codecraft.webmagic.selector.Selectors.*; -public interface SelectorFactory { - Selector compileSelector(String expressionValue, String[] expressionParams); -} - -class CssSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return $(expressionValue, expressionParams[0]); - } else { - return $(expressionValue); - } - } -} - -class XPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return xpath(expressionValue); - } -} - -class RegexSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - if (expressionParams.length >= 1) { - return regex(expressionValue, Integer.parseInt(expressionParams[0])); - } else { - return regex(expressionValue); - } - } -} - -class JsonPathSelectorFactory implements SelectorFactory { - @Override - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return new JsonPathSelector(expressionValue); - } -} - -class SelectorCompiler { - private final SelectorFactory selectorFactory; - - public SelectorCompiler(SelectorFactory selectorFactory) { - this.selectorFactory = selectorFactory; - } - - public Selector compileSelector(String expressionValue, String[] expressionParams) { - return selectorFactory.compileSelector(expressionValue, expressionParams); - } -} \ No newline at end of file