Merge branch 'release/0.10.0'

pull/835/merge WebMagic-0.10.0
Sutra Zhou 1 year ago
commit 5d55bf33d2

@ -118,9 +118,9 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))
There are more examples in `webmagic-samples` package.
### Lisence:
### License:
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Thanks:

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>

@ -5,7 +5,7 @@
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.9</version>
<version>1.11.1</version>
</skin>
<body>
<menu ref="parent" inherit="top" />

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -49,15 +49,34 @@ public class Page {
private byte[] bytes;
private List<Request> targetRequests = new ArrayList<Request>();
private List<Request> targetRequests = new ArrayList<>();
private String charset;
public Page() {
}
public static Page fail(){
/**
 * Creates a {@link Page} whose {@link #downloadSuccess} flag is {@code false}.
 * Delegates to {@link #fail(Request)} with a {@code null} request.
 *
 * @return the page.
 * @deprecated Use {@link #fail(Request)} instead.
 */
@Deprecated
public static Page fail() {
    // The cast only makes the chosen overload explicit.
    return fail((Request) null);
}
/**
 * Creates a {@link Page} whose {@link #downloadSuccess} flag is {@code false}
 * and whose {@link #request} is the given one.
 *
 * @param request the request to attach to the returned page.
 * @return the page.
 * @since 0.10.0
 */
public static Page fail(Request request) {
    Page failedPage = new Page();
    failedPage.setRequest(request);
    failedPage.setDownloadSuccess(false);
    return failedPage;
}
@ -123,13 +142,7 @@ public class Page {
* @param requests requests
*/
public void addTargetRequests(Iterable<String> requests) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
addTargetRequests(requests, 0); // Default priority is 0
}
/**
@ -139,13 +152,32 @@ public class Page {
* @param priority priority
*/
public void addTargetRequests(Iterable<String> requests, long priority) {
for (String s : requests) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s).setPriority(priority));
if(requests == null) {
return;
}
for (String req : requests) {
addRequestIfValid(req, priority);
}
}
/**
 * Helper method to add a request if it's valid.
 *
 * <p>Blank URLs, the bare fragment {@code "#"} and {@code javascript:} links are
 * skipped. Any other URL is canonicalized against this page's URL and appended
 * to the target requests.
 *
 * @param url URL to add
 * @param priority Priority for the URL; any non-zero value, including a
 *                 negative one, is set on the created request
 */
private void addRequestIfValid(String url, long priority) {
    if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
        return;
    }

    String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
    Request req = new Request(canonicalizedUrl);
    // Skip only the default priority 0. A "priority > 0" guard would silently
    // discard negative priorities requested by the caller.
    if (priority != 0) {
        req.setPriority(priority);
    }
    targetRequests.add(req);
}
/**

@ -36,26 +36,62 @@ public abstract class AbstractDownloader implements Downloader {
return (Html) page.getHtml();
}
/**
* Success callback taking only the request; does nothing by default, so
* subclasses may override it. Reached through {@link #onSuccess(Request, Task)}.
*
* @param request the {@link Request}.
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
*/
@Deprecated
protected void onSuccess(Request request) {
}
/**
 * Success callback taking the request and the task. The default
 * implementation ignores {@code task} and forwards to
 * {@link #onSuccess(Request)}.
 *
 * @param request the {@link Request}.
 * @param task the {@link Task}.
 * @since 0.7.6
 * @deprecated Use {@link #onSuccess(Page, Task)} instead.
 */
@Deprecated
protected void onSuccess(Request request, Task task) {
    onSuccess(request);
}
/**
 * Success callback taking the downloaded page and the task. The default
 * implementation forwards the page's request to the deprecated
 * {@link #onSuccess(Request, Task)} so that existing overrides keep being called.
 *
 * @param page the {@link Page}.
 * @param task the {@link Task}.
 * @since 0.10.0
 */
protected void onSuccess(Page page, Task task) {
    Request request = page.getRequest();
    onSuccess(request, task);
}
/**
* Error callback taking only the request; does nothing by default, so
* subclasses may override it. Reached through
* {@link #onError(Request, Task, Throwable)}.
*
* @param request the {@link Request}.
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
*/
@Deprecated
protected void onError(Request request) {
}
/**
 * Error callback taking the request, the task and the failure. The default
 * implementation ignores {@code task} and {@code e} and forwards to
 * {@link #onError(Request)}.
 *
 * @param request the {@link Request}.
 * @param task the {@link Task}.
 * @param e the exception.
 * @since 0.7.6
 * @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
 */
@Deprecated
protected void onError(Request request, Task task, Throwable e) {
    onError(request);
}
/**
 * Error callback taking the page, the task and the failure. The default
 * implementation forwards the page's request to the deprecated
 * {@link #onError(Request, Task, Throwable)} so that existing overrides keep
 * being called.
 *
 * @param page the {@link Page}.
 * @param task the {@link Task}.
 * @param e the exception.
 * @since 0.10.0
 */
protected void onError(Page page, Task task, Throwable e) {
    Request request = page.getRequest();
    onError(request, task, e);
}
}

@ -79,18 +79,18 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail();
Page page = Page.fail(request);
try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(request, task);
onSuccess(page, task);
logger.info("downloading page success {}", request.getUrl());
return page;
} catch (IOException e) {
onError(request, task, e);
onError(page, task, e);
logger.info("download page {} error", request.getUrl(), e);
return page;

@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.collections4.CollectionUtils;
/**
@ -55,11 +56,12 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public String get() {
if (CollectionUtils.isNotEmpty(all())) {
return all().get(0);
} else {
return null;
}
List<String> sourceTexts = all();
if (CollectionUtils.isNotEmpty(sourceTexts)) {
return sourceTexts.get(0);
}
return null;
}
@Override
@ -91,8 +93,9 @@ public abstract class AbstractSelectable implements Selectable {
}
public String getFirstSourceText() {
if (getSourceTexts() != null && getSourceTexts().size() > 0) {
return getSourceTexts().get(0);
List<String> sourceTexts = getSourceTexts();
if (CollectionUtils.isNotEmpty(sourceTexts)) {
return sourceTexts.get(0);
}
return null;
}
@ -104,6 +107,6 @@ public abstract class AbstractSelectable implements Selectable {
@Override
public boolean match() {
return getSourceTexts() != null && getSourceTexts().size() > 0;
return CollectionUtils.isNotEmpty(getSourceTexts());
}
}

@ -6,12 +6,6 @@ package us.codecraft.webmagic.utils;
public abstract class NumberUtils {
public static int compareLong(long o1, long o2) {
if (o1 < o2) {
return -1;
} else if (o1 == o2) {
return 0;
} else {
return 1;
}
return Long.compare(o1, o2);
}
}

@ -21,10 +21,10 @@ public class WMCollections {
}
public static <T> List<T> newArrayList(T... t){
List<T> set = new ArrayList<T>(t.length);
List<T> list = new ArrayList<T>(t.length);
for (T t1 : t) {
set.add(t1);
list.add(t1);
}
return set;
return list;
}
}

@ -8,7 +8,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<artifactId>webmagic-coverage</artifactId>

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -88,7 +88,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
logger.info("downloading page: " + request.getUrl());
}
Page page = Page.fail();
Page page = Page.fail(request);
try {
String content = getPage(request);
if (!content.contains("HTTP request failed")) {
@ -98,9 +98,9 @@ public class PhantomJSDownloader extends AbstractDownloader {
page.setRequest(request);
page.setStatusCode(200);
}
onSuccess(request, task);
onSuccess(page, task);
} catch (Exception e) {
onError(request, task, e);
onError(page, task, e);
logger.warn("download page {} error", request.getUrl(), e);
}
return page;

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.9.1</version>
<version>0.10.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -74,7 +74,7 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
public Page download(Request request, Task task) {
checkInit();
WebDriver webDriver = null;
Page page = Page.fail();
Page page = Page.fail(request);
try {
webDriver = webDriverPool.get();
@ -111,10 +111,10 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
onSuccess(request, task);
onSuccess(page, task);
} catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e);
onError(request, task, e);
onError(page, task, e);
} finally {
if (webDriver != null) {
webDriverPool.returnToPool(webDriver);

Loading…
Cancel
Save