Merge branch 'release/0.10.0'

pull/835/merge WebMagic-0.10.0
Sutra Zhou 1 year ago
commit 5d55bf33d2

@ -118,9 +118,9 @@ The architecture of webmagic (referred to [Scrapy](http://scrapy.org/))
There are more examples in `webmagic-samples` package. There are more examples in `webmagic-samples` package.
### Lisence: ### License:
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0) Licensed under [Apache 2.0 license](http://opensource.org/licenses/Apache-2.0)
### Thanks: ### Thanks:

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.9.1</version> <version>0.10.0</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>

@ -5,7 +5,7 @@
<skin> <skin>
<groupId>org.apache.maven.skins</groupId> <groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId> <artifactId>maven-fluido-skin</artifactId>
<version>1.9</version> <version>1.11.1</version>
</skin> </skin>
<body> <body>
<menu ref="parent" inherit="top" /> <menu ref="parent" inherit="top" />

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -49,15 +49,34 @@ public class Page {
private byte[] bytes; private byte[] bytes;
private List<Request> targetRequests = new ArrayList<Request>(); private List<Request> targetRequests = new ArrayList<>();
private String charset; private String charset;
public Page() { public Page() {
} }
public static Page fail(){ /**
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}.
*
* @return the page.
* @deprecated Use {@link #fail(Request)} instead.
*/
@Deprecated
public static Page fail() {
return fail(null);
}
/**
* Returns a {@link Page} whose {@link #downloadSuccess} is {@code false}
* and whose {@link #request} is set to the given request.
*
* @return the page.
* @since 0.10.0
*/
public static Page fail(Request request){
Page page = new Page(); Page page = new Page();
page.setRequest(request);
page.setDownloadSuccess(false); page.setDownloadSuccess(false);
return page; return page;
} }
@ -123,13 +142,7 @@ public class Page {
* @param requests requests * @param requests requests
*/ */
public void addTargetRequests(Iterable<String> requests) { public void addTargetRequests(Iterable<String> requests) {
for (String s : requests) { addTargetRequests(requests, 0); // Default priority is 0
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
continue;
}
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s));
}
} }
/** /**
@ -139,13 +152,32 @@ public class Page {
* @param priority priority * @param priority priority
*/ */
public void addTargetRequests(Iterable<String> requests, long priority) { public void addTargetRequests(Iterable<String> requests, long priority) {
for (String s : requests) { if(requests == null) {
if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { return;
continue; }
for (String req : requests) {
addRequestIfValid(req, priority);
} }
s = UrlUtils.canonicalizeUrl(s, url.toString());
targetRequests.add(new Request(s).setPriority(priority));
} }
/**
* Helper method to add a request if it's valid.
*
* @param url URL to add
* @param priority Priority for the URL
*/
private void addRequestIfValid(String url, long priority) {
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
return;
}
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
Request req = new Request(canonicalizedUrl);
if(priority > 0) {
req.setPriority(priority);
}
targetRequests.add(req);
} }
/** /**

@ -36,26 +36,62 @@ public abstract class AbstractDownloader implements Downloader {
return (Html) page.getHtml(); return (Html) page.getHtml();
} }
/**
* @param request the {@link Request}.
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
*/
@Deprecated @Deprecated
protected void onSuccess(Request request) { protected void onSuccess(Request request) {
} }
/** /**
* @param request the {@link Request}.
* @param task the {@link Task}.
* @since 0.7.6 * @since 0.7.6
* @deprecated Use {@link #onSuccess(Page, Task)} instead.
*/ */
@Deprecated
protected void onSuccess(Request request, Task task) { protected void onSuccess(Request request, Task task) {
this.onSuccess(request); this.onSuccess(request);
} }
/**
* @param page the {@link Page}.
* @param task the {@link Task}.
* @since 0.10.0
*/
protected void onSuccess(Page page, Task task) {
this.onSuccess(page.getRequest(), task);
}
/**
* @param request the {@link Request}.
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
*/
@Deprecated @Deprecated
protected void onError(Request request) { protected void onError(Request request) {
} }
/** /**
* @param request the {@link Request}.
* @param task the {@link Task}.
* @param e the exception.
* @since 0.7.6 * @since 0.7.6
* @deprecated Use {@link #onError(Page, Task, Throwable)} instead.
*/ */
@Deprecated
protected void onError(Request request, Task task, Throwable e) { protected void onError(Request request, Task task, Throwable e) {
this.onError(request); this.onError(request);
} }
/**
* @param page the {@link Page}.
* @param task the {@link Task}.
* @param e the exception.
* @since 0.10.0
*/
protected void onError(Page page, Task task, Throwable e) {
this.onError(page.getRequest(), task, e);
}
} }

@ -79,18 +79,18 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpClient httpClient = getHttpClient(task.getSite()); CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail(); Page page = Page.fail(request);
try { try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(request, task); onSuccess(page, task);
logger.info("downloading page success {}", request.getUrl()); logger.info("downloading page success {}", request.getUrl());
return page; return page;
} catch (IOException e) { } catch (IOException e) {
onError(request, task, e); onError(page, task, e);
logger.info("download page {} error", request.getUrl(), e); logger.info("download page {} error", request.getUrl(), e);
return page; return page;

@ -3,6 +3,7 @@ package us.codecraft.webmagic.selector;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.collections4.CollectionUtils;
/** /**
@ -55,11 +56,12 @@ public abstract class AbstractSelectable implements Selectable {
@Override @Override
public String get() { public String get() {
if (CollectionUtils.isNotEmpty(all())) { List<String> sourceTexts = all();
return all().get(0); if (CollectionUtils.isNotEmpty(sourceTexts)) {
} else { return sourceTexts.get(0);
return null;
} }
return null;
} }
@Override @Override
@ -91,8 +93,9 @@ public abstract class AbstractSelectable implements Selectable {
} }
public String getFirstSourceText() { public String getFirstSourceText() {
if (getSourceTexts() != null && getSourceTexts().size() > 0) { List<String> sourceTexts = getSourceTexts();
return getSourceTexts().get(0); if (CollectionUtils.isNotEmpty(sourceTexts)) {
return sourceTexts.get(0);
} }
return null; return null;
} }
@ -104,6 +107,6 @@ public abstract class AbstractSelectable implements Selectable {
@Override @Override
public boolean match() { public boolean match() {
return getSourceTexts() != null && getSourceTexts().size() > 0; return CollectionUtils.isNotEmpty(getSourceTexts());
} }
} }

@ -6,12 +6,6 @@ package us.codecraft.webmagic.utils;
public abstract class NumberUtils { public abstract class NumberUtils {
public static int compareLong(long o1, long o2) { public static int compareLong(long o1, long o2) {
if (o1 < o2) { return Long.compare(o1, o2);
return -1;
} else if (o1 == o2) {
return 0;
} else {
return 1;
}
} }
} }

@ -21,10 +21,10 @@ public class WMCollections {
} }
public static <T> List<T> newArrayList(T... t){ public static <T> List<T> newArrayList(T... t){
List<T> set = new ArrayList<T>(t.length); List<T> list = new ArrayList<T>(t.length);
for (T t1 : t) { for (T t1 : t) {
set.add(t1); list.add(t1);
} }
return set; return list;
} }
} }

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<artifactId>webmagic-coverage</artifactId> <artifactId>webmagic-coverage</artifactId>

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -88,7 +88,7 @@ public class PhantomJSDownloader extends AbstractDownloader {
logger.info("downloading page: " + request.getUrl()); logger.info("downloading page: " + request.getUrl());
} }
Page page = Page.fail(); Page page = Page.fail(request);
try { try {
String content = getPage(request); String content = getPage(request);
if (!content.contains("HTTP request failed")) { if (!content.contains("HTTP request failed")) {
@ -98,9 +98,9 @@ public class PhantomJSDownloader extends AbstractDownloader {
page.setRequest(request); page.setRequest(request);
page.setStatusCode(200); page.setStatusCode(200);
} }
onSuccess(request, task); onSuccess(page, task);
} catch (Exception e) { } catch (Exception e) {
onError(request, task, e); onError(page, task, e);
logger.warn("download page {} error", request.getUrl(), e); logger.warn("download page {} error", request.getUrl(), e);
} }
return page; return page;

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.9.1</version> <version>0.10.0</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -74,7 +74,7 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
public Page download(Request request, Task task) { public Page download(Request request, Task task) {
checkInit(); checkInit();
WebDriver webDriver = null; WebDriver webDriver = null;
Page page = Page.fail(); Page page = Page.fail(request);
try { try {
webDriver = webDriverPool.get(); webDriver = webDriverPool.get();
@ -111,10 +111,10 @@ public class SeleniumDownloader extends AbstractDownloader implements Closeable
page.setHtml(new Html(content, request.getUrl())); page.setHtml(new Html(content, request.getUrl()));
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
onSuccess(request, task); onSuccess(page, task);
} catch (Exception e) { } catch (Exception e) {
logger.warn("download page {} error", request.getUrl(), e); logger.warn("download page {} error", request.getUrl(), e);
onError(request, task, e); onError(page, task, e);
} finally { } finally {
if (webDriver != null) { if (webDriver != null) {
webDriverPool.returnToPool(webDriver); webDriverPool.returnToPool(webDriver);

Loading…
Cancel
Save