Revert "Refactored code for increased optimization. (#1139)"

This reverts commit f051d978e2.
pull/1153/head
Sutra Zhou 10 months ago committed by GitHub
parent f051d978e2
commit 2c0e1494ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -169,25 +169,18 @@ public class Page {
* @param priority Priority for the URL
*/
private void addRequestIfValid(String url, long priority) {
boolean isBlankUrl = StringUtils.isBlank(url);
boolean isHashSymbol = url.equals("#");
boolean isJavaScript = url.startsWith("javascript:");
if (isBlankUrl || isHashSymbol || isJavaScript) {
return; // Invalid URL, so no further processing is needed.
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
return;
}
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
Request request = new Request(canonicalizedUrl);
if (priority > 0) {
request.setPriority(priority);
Request req = new Request(canonicalizedUrl);
if(priority > 0) {
req.setPriority(priority);
}
targetRequests.add(request);
targetRequests.add(req);
}
/**
* add url to fetch
*

@ -40,14 +40,13 @@ public class HttpClientGenerator {
private PoolingHttpClientConnectionManager connectionManager;
private static final int DEFAULT_MAX_PER_ROUTE = 100;
public HttpClientGenerator() {
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", buildSSLConnectionSocketFactory())
.build();
connectionManager = new PoolingHttpClientConnectionManager(reg);
connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE);
connectionManager.setDefaultMaxPerRoute(100);
}
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {

@ -64,7 +64,7 @@ public class HttpRequestBody implements Serializable {
this.encoding = encoding;
}
public static HttpRequestBody createJsonRequestBody(String json, String encoding) {
public static HttpRequestBody json(String json, String encoding) {
try {
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
} catch (UnsupportedEncodingException e) {

@ -1,53 +0,0 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * Runs a {@link BaseElementSelector} over the elements of an {@link HtmlNode}.
 *
 * <p>NOTE(review): the no-argument constructor binds the helper to a freshly created
 * {@code new HtmlNode()}, not to the node the caller is working on. Presumably that node
 * carries no parsed elements; confirm against {@code HtmlNode}'s no-arg constructor. Use
 * {@link #ElementsUtil(HtmlNode)} to select against an existing node.
 */
public class ElementsUtil {

    /** Node whose elements are iterated by {@link #selectElements(BaseElementSelector)}. */
    HtmlNode htmlNode;

    /**
     * Creates a helper bound to {@code new HtmlNode()}, exactly as the former field
     * initializer did.
     */
    public ElementsUtil() {
        this(new HtmlNode());
    }

    /**
     * Creates a helper bound to the given node.
     *
     * @param htmlNode node whose elements will be selected from
     */
    public ElementsUtil(HtmlNode htmlNode) {
        this.htmlNode = htmlNode;
    }

    /**
     * Applies the selector to every element of the bound node.
     *
     * <p>When the selector reports {@code hasAttribute()}, the per-element string results
     * are collected into a {@link PlainText}; otherwise the per-element element results
     * are collected into a new {@link HtmlNode}.
     *
     * @param elementSelector selector to apply
     * @return the collected results
     */
    public Selectable selectElements(BaseElementSelector elementSelector) {
        ListIterator<Element> elementIterator = htmlNode.getElements().listIterator();
        if (elementSelector.hasAttribute()) {
            // has attribute, consider as plaintext
            List<String> resultStrings = new ArrayList<String>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                resultStrings.addAll(elementSelector.selectList(element));
            }
            return new PlainText(resultStrings);
        }
        List<Element> resultElements = new ArrayList<Element>();
        while (elementIterator.hasNext()) {
            Element element = checkElementAndConvert(elementIterator);
            resultElements.addAll(elementSelector.selectElements(element));
        }
        return new HtmlNode(resultElements);
    }

    /**
     * Advances the iterator and returns a {@link Document} to select against.
     *
     * <p>Only a document can be selected on, so a non-document element is cloned into a
     * new document, and that document replaces the element in the iterated list.
     * See: https://github.com/code4craft/webmagic/issues/113
     *
     * @param elementIterator iterator positioned before the element to process
     * @return the element itself if it already is a {@link Document}, otherwise the new
     *         wrapper document
     */
    public Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element element = elementIterator.next();
        if (element instanceof Document) {
            return element;
        }
        // ownerDocument() is null for an element not attached to any document; fall back
        // to the element's own base URI instead of failing with a NullPointerException.
        Document owner = element.ownerDocument();
        String baseUri = owner != null ? owner.baseUri() : element.baseUri();
        Document root = new Document(baseUri);
        root.appendChild(element.clone());
        // Replace the list entry just returned by next() with the wrapper document.
        elementIterator.set(root);
        return root;
    }
}

@ -33,22 +33,19 @@ public class HtmlNode extends AbstractSelectable {
@Override
public Selectable links() {
ElementsUtil elementsUtil = new ElementsUtil();
return elementsUtil.selectElements(new LinksSelector());
return selectElements(new LinksSelector());
}
@Override
public Selectable xpath(String xpath) {
ElementsUtil elementsUtil = new ElementsUtil();
XpathSelector xpathSelector = Selectors.xpath(xpath);
return elementsUtil.selectElements(xpathSelector);
return selectElements(xpathSelector);
}
@Override
public Selectable selectList(Selector selector) {
if (selector instanceof BaseElementSelector) {
ElementsUtil elementsUtil = new ElementsUtil();
return elementsUtil.selectElements((BaseElementSelector) selector);
return selectElements((BaseElementSelector) selector);
}
return selectList(selector, getSourceTexts());
}
@ -58,18 +55,64 @@ public class HtmlNode extends AbstractSelectable {
return selectList(selector);
}
/**
 * Applies an element selector to every element held by this node.
 *
 * <p>If the selector reports {@code hasAttribute()}, the string results for each element
 * are gathered into a {@link PlainText}. Otherwise the element results are gathered into
 * a new {@link HtmlNode}.
 *
 * @param elementSelector elementSelector
 * @return result
 */
protected Selectable selectElements(BaseElementSelector elementSelector) {
    ListIterator<Element> cursor = getElements().listIterator();
    if (elementSelector.hasAttribute()) {
        // has attribute, consider as plaintext
        List<String> texts = new ArrayList<String>();
        while (cursor.hasNext()) {
            texts.addAll(elementSelector.selectList(checkElementAndConvert(cursor)));
        }
        return new PlainText(texts);
    }
    List<Element> matched = new ArrayList<Element>();
    while (cursor.hasNext()) {
        matched.addAll(elementSelector.selectElements(checkElementAndConvert(cursor)));
    }
    return new HtmlNode(matched);
}
/**
 * Advances the iterator and returns a {@link Document} to select against.
 *
 * <p>Only a document can be selected on, so a non-document element is cloned into a new
 * document, and that document replaces the element in the iterated list.
 * See: https://github.com/code4craft/webmagic/issues/113
 *
 * @param elementIterator iterator positioned before the element to process
 * @return the element itself if it already is a {@link Document}, otherwise the new
 *         wrapper document
 */
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
    Element element = elementIterator.next();
    if (element instanceof Document) {
        return element;
    }
    // ownerDocument() is null for an element not attached to any document; fall back to
    // the element's own base URI instead of failing with a NullPointerException.
    Document owner = element.ownerDocument();
    String baseUri = owner != null ? owner.baseUri() : element.baseUri();
    Document root = new Document(baseUri);
    root.appendChild(element.clone());
    // Replace the list entry just returned by next() with the wrapper document.
    elementIterator.set(root);
    return root;
}
@Override
public Selectable $(String selector) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector);
return elementsUtil.selectElements(cssSelector);
return selectElements(cssSelector);
}
@Override
public Selectable $(String selector, String attrName) {
ElementsUtil elementsUtil = new ElementsUtil();
CssSelector cssSelector = Selectors.$(selector, attrName);
return elementsUtil.selectElements(cssSelector);
return selectElements(cssSelector);
}
@Override

@ -76,27 +76,26 @@ public class ExtractRule {
}
private Selector compileSelector() {
SelectorFactory factory;
switch (expressionType) {
case Css:
factory = new CssSelectorFactory();
break;
if (expressionParams.length >= 1) {
return $(expressionValue, expressionParams[0]);
} else {
return $(expressionValue);
}
case XPath:
factory = new XPathSelectorFactory();
break;
return xpath(expressionValue);
case Regex:
factory = new RegexSelectorFactory();
break;
if (expressionParams.length >= 1) {
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
} else {
return regex(expressionValue);
}
case JsonPath:
factory = new JsonPathSelectorFactory();
break;
return new JsonPathSelector(expressionValue);
default:
factory = new XPathSelectorFactory(); // Default to XPath
return xpath(expressionValue);
}
SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
return compiledSelector;
}
public void setSelector(Selector selector) {

@ -1,57 +0,0 @@
package us.codecraft.webmagic.configurable;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.selector.Selector;
import static us.codecraft.webmagic.selector.Selectors.*;
/**
 * Strategy for turning an expression string and its parameters into a {@link Selector}.
 */
public interface SelectorFactory {
/**
 * Builds a selector for the given expression.
 *
 * @param expressionValue the expression text
 * @param expressionParams extra parameters; each implementation decides whether and how
 *        they are used
 * @return the selector built from the expression
 */
Selector compileSelector(String expressionValue, String[] expressionParams);
}
/**
 * Builds selectors through the statically imported {@code $} helpers.
 */
class CssSelectorFactory implements SelectorFactory {

    /**
     * Calls {@code $(expressionValue)} when no parameters are given, otherwise
     * {@code $(expressionValue, expressionParams[0])}.
     *
     * @param expressionValue expression passed to {@code $}
     * @param expressionParams optional parameters; only the first one is used
     * @return the selector returned by {@code $}
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        if (expressionParams.length == 0) {
            return $(expressionValue);
        }
        String firstParam = expressionParams[0];
        return $(expressionValue, firstParam);
    }
}
/**
 * Builds selectors through the statically imported {@code xpath} helper.
 */
class XPathSelectorFactory implements SelectorFactory {

    /**
     * Delegates to {@code xpath(expressionValue)}; the parameters are not used.
     *
     * @param expressionValue expression passed to {@code xpath}
     * @param expressionParams ignored
     * @return the selector returned by {@code xpath}
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        Selector compiled = xpath(expressionValue);
        return compiled;
    }
}
/**
 * Builds selectors through the statically imported {@code regex} helpers.
 */
class RegexSelectorFactory implements SelectorFactory {

    /**
     * Calls {@code regex(expressionValue)} when no parameters are given. Otherwise parses
     * the first parameter with {@link Integer#parseInt(String)} and passes it as the
     * second argument to {@code regex}.
     *
     * @param expressionValue expression passed to {@code regex}
     * @param expressionParams optional parameters; only the first one is used
     * @return the selector returned by {@code regex}
     * @throws NumberFormatException if the first parameter is not a parsable integer
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        if (expressionParams.length == 0) {
            return regex(expressionValue);
        }
        int numericParam = Integer.parseInt(expressionParams[0]);
        return regex(expressionValue, numericParam);
    }
}
/**
 * Builds {@link JsonPathSelector} instances.
 */
class JsonPathSelectorFactory implements SelectorFactory {

    /**
     * Wraps the expression in a new {@link JsonPathSelector}; the parameters are not used.
     *
     * @param expressionValue expression passed to the {@code JsonPathSelector} constructor
     * @param expressionParams ignored
     * @return a new {@code JsonPathSelector}
     */
    @Override
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        Selector jsonPathSelector = new JsonPathSelector(expressionValue);
        return jsonPathSelector;
    }
}
/**
 * Thin wrapper that forwards selector compilation to a {@link SelectorFactory}.
 */
class SelectorCompiler {

    /** Factory that performs the actual compilation. */
    private final SelectorFactory delegate;

    /**
     * @param selectorFactory factory every {@link #compileSelector} call is forwarded to
     */
    public SelectorCompiler(SelectorFactory selectorFactory) {
        delegate = selectorFactory;
    }

    /**
     * Forwards both arguments unchanged to the wrapped factory.
     *
     * @param expressionValue the expression text
     * @param expressionParams extra parameters for the factory
     * @return whatever the wrapped factory returns
     */
    public Selector compileSelector(String expressionValue, String[] expressionParams) {
        Selector compiled = delegate.compileSelector(expressionValue, expressionParams);
        return compiled;
    }
}
Loading…
Cancel
Save