Refactored code for increased optimization. (#1139)

* refactoring by decompose conditional technique

* refactoring by introducing explaining variable technique

* refactoring by rename method/variable technique

* refactoring by introducing explaining variable technique

* Added Extract Class refactoring to increase maintainability

* Refactoring using replace conditional with polymorphism
pull/1153/head
Parthgajera056 11 months ago committed by GitHub
parent 9b9f173c1c
commit f051d978e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -169,18 +169,25 @@ public class Page {
* @param priority Priority for the URL * @param priority Priority for the URL
*/ */
private void addRequestIfValid(String url, long priority) { private void addRequestIfValid(String url, long priority) {
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) { boolean isBlankUrl = StringUtils.isBlank(url);
return; boolean isHashSymbol = url.equals("#");
boolean isJavaScript = url.startsWith("javascript:");
if (isBlankUrl || isHashSymbol || isJavaScript) {
return; // Invalid URL, so no further processing is needed.
} }
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString()); String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
Request req = new Request(canonicalizedUrl); Request request = new Request(canonicalizedUrl);
if(priority > 0) {
req.setPriority(priority); if (priority > 0) {
request.setPriority(priority);
} }
targetRequests.add(req);
targetRequests.add(request);
} }
/** /**
* add url to fetch * add url to fetch
* *

@ -40,13 +40,14 @@ public class HttpClientGenerator {
private PoolingHttpClientConnectionManager connectionManager; private PoolingHttpClientConnectionManager connectionManager;
private static final int DEFAULT_MAX_PER_ROUTE = 100;
public HttpClientGenerator() { public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create() Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE) .register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory()) .register("https", buildSSLConnectionSocketFactory())
.build(); .build();
connectionManager = new PoolingHttpClientConnectionManager(reg); connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(100); connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE);
} }
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {

@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable {
this.encoding = encoding; this.encoding = encoding;
} }
public static HttpRequestBody json(String json, String encoding) { public static HttpRequestBody createJsonRequestBody(String json, String encoding) {
try { try {
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding); return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {

@ -0,0 +1,53 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * Element-selection helper extracted from {@link HtmlNode}.
 *
 * <p>Applies a {@link BaseElementSelector} to every element of an
 * {@link HtmlNode} and wraps the matches in a {@link Selectable}.
 *
 * <p>Use {@link #ElementsUtil(HtmlNode)} to search the elements of an existing
 * node. The no-argument constructor is kept for backward compatibility only:
 * it searches a freshly constructed {@code HtmlNode}, not the node of the
 * caller.
 */
public class ElementsUtil {

    /** Node whose elements are iterated by {@link #selectElements}. */
    HtmlNode htmlNode;

    /**
     * Creates a helper bound to a new, empty {@link HtmlNode}.
     *
     * <p>Retained so existing {@code new ElementsUtil()} call sites keep
     * compiling; prefer {@link #ElementsUtil(HtmlNode)} so that selection runs
     * against the intended node.
     */
    public ElementsUtil() {
        this(new HtmlNode());
    }

    /**
     * Creates a helper bound to the given node.
     *
     * @param htmlNode node whose elements will be searched; must not be null
     * @throws IllegalArgumentException if {@code htmlNode} is null
     */
    public ElementsUtil(HtmlNode htmlNode) {
        if (htmlNode == null) {
            throw new IllegalArgumentException("htmlNode must not be null");
        }
        this.htmlNode = htmlNode;
    }

    /**
     * Runs the selector over every element of the bound node.
     *
     * @param elementSelector selector to apply to each element
     * @return an {@link HtmlNode} of matched elements when the selector has no
     *         attribute, otherwise a {@link PlainText} of the selected strings
     */
    public Selectable selectElements(BaseElementSelector elementSelector) {
        ListIterator<Element> elementIterator = htmlNode.getElements().listIterator();
        if (!elementSelector.hasAttribute()) {
            List<Element> resultElements = new ArrayList<Element>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                resultElements.addAll(elementSelector.selectElements(element));
            }
            return new HtmlNode(resultElements);
        }
        // has attribute, consider as plaintext
        List<String> resultStrings = new ArrayList<String>();
        while (elementIterator.hasNext()) {
            Element element = checkElementAndConvert(elementIterator);
            resultStrings.addAll(elementSelector.selectList(element));
        }
        return new PlainText(resultStrings);
    }

    /**
     * Only document can be select
     * See: https://github.com/code4craft/webmagic/issues/113
     *
     * <p>Advances the iterator by one element. If that element is not a
     * {@link Document}, a clone of it is appended to a new {@code Document}
     * (created with the owner document's base URI), the new document replaces
     * the element in the underlying list via {@link ListIterator#set}, and the
     * new document is returned.
     *
     * @param elementIterator iterator positioned before the element to check
     * @return the element itself if it is a {@code Document}, otherwise the
     *         new wrapping {@code Document}
     */
    public Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element element = elementIterator.next();
        if (!(element instanceof Document)) {
            Document root = new Document(element.ownerDocument().baseUri());
            Element clone = element.clone();
            root.appendChild(clone);
            elementIterator.set(root);
            return root;
        }
        return element;
    }
}

@ -33,19 +33,22 @@ public class HtmlNode extends AbstractSelectable {
@Override @Override
public Selectable links() { public Selectable links() {
return selectElements(new LinksSelector()); ElementsUtil elementsUtil = new ElementsUtil();
return elementsUtil.selectElements(new LinksSelector());
} }
@Override @Override
public Selectable xpath(String xpath) { public Selectable xpath(String xpath) {
ElementsUtil elementsUtil = new ElementsUtil();
XpathSelector xpathSelector = Selectors.xpath(xpath); XpathSelector xpathSelector = Selectors.xpath(xpath);
return selectElements(xpathSelector); return elementsUtil.selectElements(xpathSelector);
} }
@Override @Override
public Selectable selectList(Selector selector) { public Selectable selectList(Selector selector) {
if (selector instanceof BaseElementSelector) { if (selector instanceof BaseElementSelector) {
return selectElements((BaseElementSelector) selector); ElementsUtil elementsUtil = new ElementsUtil();
return elementsUtil.selectElements((BaseElementSelector) selector);
} }
return selectList(selector, getSourceTexts()); return selectList(selector, getSourceTexts());
} }
@ -55,64 +58,18 @@ public class HtmlNode extends AbstractSelectable {
return selectList(selector); return selectList(selector);
} }
/**
* select elements
*
* @param elementSelector elementSelector
* @return result
*/
protected Selectable selectElements(BaseElementSelector elementSelector) {
ListIterator<Element> elementIterator = getElements().listIterator();
if (!elementSelector.hasAttribute()) {
List<Element> resultElements = new ArrayList<Element>();
while (elementIterator.hasNext()) {
Element element = checkElementAndConvert(elementIterator);
List<Element> selectElements = elementSelector.selectElements(element);
resultElements.addAll(selectElements);
}
return new HtmlNode(resultElements);
} else {
// has attribute, consider as plaintext
List<String> resultStrings = new ArrayList<String>();
while (elementIterator.hasNext()) {
Element element = checkElementAndConvert(elementIterator);
List<String> selectList = elementSelector.selectList(element);
resultStrings.addAll(selectList);
}
return new PlainText(resultStrings);
}
}
/**
* Only document can be select
* See: https://github.com/code4craft/webmagic/issues/113
*
* @param elementIterator elementIterator
* @return element element
*/
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
Element element = elementIterator.next();
if (!(element instanceof Document)) {
Document root = new Document(element.ownerDocument().baseUri());
Element clone = element.clone();
root.appendChild(clone);
elementIterator.set(root);
return root;
}
return element;
}
@Override @Override
public Selectable $(String selector) { public Selectable $(String selector) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector); CssSelector cssSelector = Selectors.$(selector);
return selectElements(cssSelector); return elementsUtil.selectElements(cssSelector);
} }
@Override @Override
public Selectable $(String selector, String attrName) { public Selectable $(String selector, String attrName) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector, attrName); CssSelector cssSelector = Selectors.$(selector, attrName);
return selectElements(cssSelector); return elementsUtil.selectElements(cssSelector);
} }
@Override @Override

@ -76,26 +76,27 @@ public class ExtractRule {
} }
private Selector compileSelector() { private Selector compileSelector() {
SelectorFactory factory;
switch (expressionType) { switch (expressionType) {
case Css: case Css:
if (expressionParams.length >= 1) { factory = new CssSelectorFactory();
return $(expressionValue, expressionParams[0]); break;
} else {
return $(expressionValue);
}
case XPath: case XPath:
return xpath(expressionValue); factory = new XPathSelectorFactory();
break;
case Regex: case Regex:
if (expressionParams.length >= 1) { factory = new RegexSelectorFactory();
return regex(expressionValue, Integer.parseInt(expressionParams[0])); break;
} else {
return regex(expressionValue);
}
case JsonPath: case JsonPath:
return new JsonPathSelector(expressionValue); factory = new JsonPathSelectorFactory();
break;
default: default:
return xpath(expressionValue); factory = new XPathSelectorFactory(); // Default to XPath
} }
SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
return compiledSelector;
} }
public void setSelector(Selector selector) { public void setSelector(Selector selector) {

@ -0,0 +1,57 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Strategy for turning an expression and its optional parameters into a
 * {@link Selector}. One implementation exists per expression type
 * (CSS, XPath, regex, JsonPath).
 */
public interface SelectorFactory {
    /**
     * Builds a selector for the given expression.
     *
     * @param expressionValue  the expression text to compile
     * @param expressionParams extra parameters; how (and whether) they are
     *                         used depends on the implementation
     * @return the compiled selector
     */
    Selector compileSelector(String expressionValue, String[] expressionParams);
}
/**
 * Builds CSS selectors through the statically imported {@code $} helpers.
 */
class CssSelectorFactory implements SelectorFactory {
    /**
     * Compiles a CSS selector.
     *
     * @param expressionValue  the CSS expression
     * @param expressionParams optional parameters; when at least one is
     *                         present, the first is passed as the second
     *                         argument of {@code $} (presumably the attribute
     *                         name). May be null or empty.
     * @return the compiled selector
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        // A missing parameter array is treated like an empty one instead of
        // failing with a NullPointerException on .length.
        boolean hasFirstParam = expressionParams != null && expressionParams.length >= 1;
        if (hasFirstParam) {
            return $(expressionValue, expressionParams[0]);
        }
        return $(expressionValue);
    }
}
/**
 * Builds XPath selectors through the statically imported {@code xpath} helper.
 */
class XPathSelectorFactory implements SelectorFactory {
    /**
     * Compiles an XPath selector; {@code expressionParams} is not used.
     *
     * @param expressionValue  the XPath expression
     * @param expressionParams ignored
     * @return the compiled selector
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector compiled = xpath(expressionValue);
        return compiled;
    }
}
/**
 * Builds regex selectors through the statically imported {@code regex} helpers.
 */
class RegexSelectorFactory implements SelectorFactory {
    /**
     * Compiles a regex selector.
     *
     * @param expressionValue  the regular expression
     * @param expressionParams optional parameters; when at least one is
     *                         present, the first is parsed as an int and
     *                         passed as the second argument of {@code regex}
     *                         (presumably the capture group). May be null or
     *                         empty.
     * @return the compiled selector
     * @throws NumberFormatException if the first parameter is not a parsable
     *                               integer
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        // A missing parameter array is treated like an empty one instead of
        // failing with a NullPointerException on .length.
        boolean hasFirstParam = expressionParams != null && expressionParams.length >= 1;
        if (hasFirstParam) {
            int parsedParam = Integer.parseInt(expressionParams[0]);
            return regex(expressionValue, parsedParam);
        }
        return regex(expressionValue);
    }
}
/**
 * Builds {@link JsonPathSelector} instances.
 */
class JsonPathSelectorFactory implements SelectorFactory {
    /**
     * Wraps the expression in a new {@link JsonPathSelector};
     * {@code expressionParams} is not used.
     *
     * @param expressionValue  the JsonPath expression
     * @param expressionParams ignored
     * @return the new selector
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector jsonPathSelector = new JsonPathSelector(expressionValue);
        return jsonPathSelector;
    }
}
/**
 * Thin wrapper that forwards selector compilation to a {@link SelectorFactory}.
 */
class SelectorCompiler {

    /** Factory that performs the actual compilation. */
    private final SelectorFactory selectorFactory;

    /**
     * @param selectorFactory factory to delegate to
     */
    public SelectorCompiler(SelectorFactory selectorFactory) {
        this.selectorFactory = selectorFactory;
    }

    /**
     * Delegates to the wrapped factory with the same arguments.
     *
     * @param expressionValue  the expression text to compile
     * @param expressionParams extra parameters handed to the factory unchanged
     * @return whatever the factory returns
     */
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        final Selector compiled = this.selectorFactory.compileSelector(expressionValue, expressionParams);
        return compiled;
    }
}
Loading…
Cancel
Save