From f051d978e2f329de8f30455e6ab658789e328f1c Mon Sep 17 00:00:00 2001
From: Parthgajera056 <149322319+Parthgajera056@users.noreply.github.com>
Date: Sat, 30 Mar 2024 03:28:02 -0300
Subject: [PATCH] Refactored code for increased optimization. (#1139)

* refactoring by decompose conditional technique

* refactoring by introduction explaining variable technique

* refactoring by rename method/variable technique

* refactoring by introducing explaining variable technique

* Added Extract class refactoring to increase maintainability

* Refactoring using replace conditional with polymorphism

Review fixes folded into this patch (hunk sizes are unchanged):

* Page.addRequestIfValid: the explaining variables are evaluated eagerly,
  unlike the short-circuit condition they replace, so they are written
  null-safe to keep a null URL from throwing.

* ElementsUtil.selectElements: select from the HtmlNode the caller passes
  in instead of a newly created HtmlNode, which does not hold the
  caller's elements.
---
 .../main/java/us/codecraft/webmagic/Page.java | 19 ++++--
 .../downloader/HttpClientGenerator.java       |  3 +-
 .../webmagic/model/HttpRequestBody.java       |  2 +-
 .../webmagic/selector/ElementsUtil.java       | 53 ++++++++++++++++
 .../codecraft/webmagic/selector/HtmlNode.java | 63 +++----------------
 .../webmagic/configurable/ExtractRule.java    | 27 ++++----
 .../configurable/SelectorFactory.java         | 57 +++++++++++++++++
 7 files changed, 150 insertions(+), 74 deletions(-)
 create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java
 create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index b4c161a9..dc87ece8 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -169,18 +169,25 @@ public class Page {
      * @param priority Priority for the URL
      */
     private void addRequestIfValid(String url, long priority) {
-        if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
-            return;
+        boolean isBlankUrl = StringUtils.isBlank(url);
+        boolean isHashSymbol = "#".equals(url); // constant-first: evaluated eagerly, so it must tolerate null
+        boolean isJavaScript = url != null && url.startsWith("javascript:");
+
+        if (isBlankUrl || isHashSymbol || isJavaScript) {
+            return; // Invalid URL, so no further processing is needed.
         }
 
         String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
-        Request req = new Request(canonicalizedUrl);
-        if(priority > 0) {
-            req.setPriority(priority);
+        Request request = new Request(canonicalizedUrl);
+
+        if (priority > 0) {
+            request.setPriority(priority);
         }
-        targetRequests.add(req);
+
+        targetRequests.add(request);
     }
 
+
     /**
      * add url to fetch
      *
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index 167a5e1c..f32a4eba 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -40,13 +40,14 @@ public class HttpClientGenerator {
 
     private PoolingHttpClientConnectionManager connectionManager;
 
+    private static final int DEFAULT_MAX_PER_ROUTE = 100;
     public HttpClientGenerator() {
         Registry reg = RegistryBuilder.create()
                 .register("http", PlainConnectionSocketFactory.INSTANCE)
                 .register("https", buildSSLConnectionSocketFactory())
                 .build();
         connectionManager = new PoolingHttpClientConnectionManager(reg);
-        connectionManager.setDefaultMaxPerRoute(100);
+        connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE);
     }
 
     private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
index 7d3b3078..23606d86 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java
@@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable {
         this.encoding = encoding;
     }
 
-    public static HttpRequestBody json(String json, String encoding) {
+    public static HttpRequestBody createJsonRequestBody(String json, String encoding) {
         try {
             return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
         } catch (UnsupportedEncodingException e) {
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java
new file mode 100644
index 00000000..10873c71
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementsUtil.java
@@ -0,0 +1,53 @@
+package us.codecraft.webmagic.selector;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ListIterator;
+
+public class ElementsUtil {
+    // The node to select from is passed in; a newly created HtmlNode would not hold the caller's elements.
+    public Selectable selectElements(HtmlNode htmlNode, BaseElementSelector elementSelector) {
+        ListIterator elementIterator = htmlNode.getElements().listIterator();
+        if (!elementSelector.hasAttribute()) {
+            List resultElements = new ArrayList();
+            while (elementIterator.hasNext()) {
+                Element element = checkElementAndConvert(elementIterator);
+                List selectElements = elementSelector.selectElements(element);
+                resultElements.addAll(selectElements);
+            }
+            return new HtmlNode(resultElements);
+        } else {
+            // has attribute, consider as plaintext
+            List resultStrings = new ArrayList();
+            while (elementIterator.hasNext()) {
+                Element element = checkElementAndConvert(elementIterator);
+                List selectList = elementSelector.selectList(element);
+                resultStrings.addAll(selectList);
+            }
+            return new PlainText(resultStrings);
+
+        }
+    }
+
+    /**
+     * Only document can be select
+     * See: https://github.com/code4craft/webmagic/issues/113
+     *
+     * @param elementIterator elementIterator
+     * @return element element
+     */
+    public Element checkElementAndConvert(ListIterator elementIterator) {
+        Element element = elementIterator.next();
+        if (!(element instanceof Document)) {
+            Document root = new Document(element.ownerDocument().baseUri());
+            Element clone = element.clone();
+            root.appendChild(clone);
+            elementIterator.set(root);
+            return root;
+        }
+        return element;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
index 85ff5fa6..32a8b976 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java
@@ -33,19 +33,22 @@ public class HtmlNode extends AbstractSelectable {
 
     @Override
     public Selectable links() {
-        return selectElements(new LinksSelector());
+        ElementsUtil elementsUtil = new ElementsUtil();
+        return elementsUtil.selectElements(this, new LinksSelector());
     }
 
     @Override
     public Selectable xpath(String xpath) {
+        ElementsUtil elementsUtil = new ElementsUtil();
         XpathSelector xpathSelector = Selectors.xpath(xpath);
-        return selectElements(xpathSelector);
+        return elementsUtil.selectElements(this, xpathSelector);
     }
 
     @Override
     public Selectable selectList(Selector selector) {
         if (selector instanceof BaseElementSelector) {
-            return selectElements((BaseElementSelector) selector);
+            ElementsUtil elementsUtil = new ElementsUtil();
+            return elementsUtil.selectElements(this, (BaseElementSelector) selector);
         }
         return selectList(selector, getSourceTexts());
     }
@@ -55,64 +58,18 @@ public class HtmlNode extends AbstractSelectable {
         return selectList(selector);
     }
 
-    /**
-     * select elements
-     *
-     * @param elementSelector elementSelector
-     * @return result
-     */
-    protected Selectable selectElements(BaseElementSelector elementSelector) {
-        ListIterator elementIterator = getElements().listIterator();
-        if (!elementSelector.hasAttribute()) {
-            List resultElements = new ArrayList();
-            while (elementIterator.hasNext()) {
-                Element element = checkElementAndConvert(elementIterator);
-                List selectElements = elementSelector.selectElements(element);
-                resultElements.addAll(selectElements);
-            }
-            return new HtmlNode(resultElements);
-        } else {
-            // has attribute, consider as plaintext
-            List resultStrings = new ArrayList();
-            while (elementIterator.hasNext()) {
-                Element element = checkElementAndConvert(elementIterator);
-                List selectList = elementSelector.selectList(element);
-                resultStrings.addAll(selectList);
-            }
-            return new PlainText(resultStrings);
-
-        }
-    }
-
-    /**
-     * Only document can be select
-     * See: https://github.com/code4craft/webmagic/issues/113
-     *
-     * @param elementIterator elementIterator
-     * @return element element
-     */
-    private Element checkElementAndConvert(ListIterator elementIterator) {
-        Element element = elementIterator.next();
-        if (!(element instanceof Document)) {
-            Document root = new Document(element.ownerDocument().baseUri());
-            Element clone = element.clone();
-            root.appendChild(clone);
-            elementIterator.set(root);
-            return root;
-        }
-        return element;
-    }
-
     @Override
     public Selectable $(String selector) {
+        ElementsUtil elementsUtil = new ElementsUtil();
         CssSelector cssSelector = Selectors.$(selector);
-        return selectElements(cssSelector);
+        return elementsUtil.selectElements(this, cssSelector);
     }
 
     @Override
     public Selectable $(String selector, String attrName) {
+        ElementsUtil elementsUtil = new ElementsUtil();
         CssSelector cssSelector = Selectors.$(selector, attrName);
-        return selectElements(cssSelector);
+        return elementsUtil.selectElements(this, cssSelector);
     }
 
     @Override
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
index bbc48dda..5596cfc7 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java
@@ -76,26 +76,27 @@ public class ExtractRule {
     }
 
     private Selector compileSelector() {
+        SelectorFactory factory;
         switch (expressionType) {
             case Css:
-                if (expressionParams.length >= 1) {
-                    return $(expressionValue, expressionParams[0]);
-                } else {
-                    return $(expressionValue);
-                }
+                factory = new CssSelectorFactory();
+                break;
             case XPath:
-                return xpath(expressionValue);
+                factory = new XPathSelectorFactory();
+                break;
             case Regex:
-                if (expressionParams.length >= 1) {
-                    return regex(expressionValue, Integer.parseInt(expressionParams[0]));
-                } else {
-                    return regex(expressionValue);
-                }
+                factory = new RegexSelectorFactory();
+                break;
             case JsonPath:
-                return new JsonPathSelector(expressionValue);
+                factory = new JsonPathSelectorFactory();
+                break;
             default:
-                return xpath(expressionValue);
+                factory = new XPathSelectorFactory(); // Default to XPath
         }
+
+        SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
+        Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
+        return compiledSelector;
     }
 
     public void setSelector(Selector selector) {
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java
new file mode 100644
index 00000000..7bca4ba7
--- /dev/null
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/SelectorFactory.java
@@ -0,0 +1,57 @@
+package us.codecraft.webmagic.configurable;
+
+import us.codecraft.webmagic.selector.JsonPathSelector;
+import us.codecraft.webmagic.selector.Selector;
+
+import static us.codecraft.webmagic.selector.Selectors.*;
+public interface SelectorFactory {
+    Selector compileSelector(String expressionValue, String[] expressionParams);
+}
+
+class CssSelectorFactory implements SelectorFactory {
+    @Override
+    public Selector compileSelector(String expressionValue, String[] expressionParams) {
+        if (expressionParams.length >= 1) {
+            return $(expressionValue, expressionParams[0]);
+        } else {
+            return $(expressionValue);
+        }
+    }
+}
+
+class XPathSelectorFactory implements SelectorFactory {
+    @Override
+    public Selector compileSelector(String expressionValue, String[] expressionParams) {
+        return xpath(expressionValue);
+    }
+}
+
+class RegexSelectorFactory implements SelectorFactory {
+    @Override
+    public Selector compileSelector(String expressionValue, String[] expressionParams) {
+        if (expressionParams.length >= 1) {
+            return regex(expressionValue, Integer.parseInt(expressionParams[0]));
+        } else {
+            return regex(expressionValue);
+        }
+    }
+}
+
+class JsonPathSelectorFactory implements SelectorFactory {
+    @Override
+    public Selector compileSelector(String expressionValue, String[] expressionParams) {
+        return new JsonPathSelector(expressionValue);
+    }
+}
+
+class SelectorCompiler {
+    private final SelectorFactory selectorFactory;
+
+    public SelectorCompiler(SelectorFactory selectorFactory) {
+        this.selectorFactory = selectorFactory;
+    }
+
+    public Selector compileSelector(String expressionValue, String[] expressionParams) {
+        return selectorFactory.compileSelector(expressionValue, expressionParams);
+    }
+}
\ No newline at end of file