Merge branch 'release/1.0.2'

pull/1181/head WebMagic-1.0.2
Joe Zhou 2 months ago
commit 837253cfc9

@ -50,7 +50,7 @@ WebMagic use slf4j with slf4j-log4j12 implementation. If you customized your slf
### First crawler: ### First crawler:
Write a class implements PageProcessor. For example, I wrote a crawler of github repository infomation. Write a class implements PageProcessor. For example, I wrote a crawler of github repository information.
```java ```java
public class GithubRepoPageProcessor implements PageProcessor { public class GithubRepoPageProcessor implements PageProcessor {
@ -112,7 +112,7 @@ public class GithubRepo {
Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/) Documents: [http://webmagic.io/docs/](http://webmagic.io/docs/)
The architecture of webmagic (refered to [Scrapy](http://scrapy.org/)) The architecture of webmagic (referred to [Scrapy](http://scrapy.org/))
![image](http://code4craft.github.io/images/posts/webmagic.png) ![image](http://code4craft.github.io/images/posts/webmagic.png)

@ -12,7 +12,7 @@
<version>2.2.1</version> <version>2.2.1</version>
</parent> </parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>1.0.1</version> <version>1.0.2</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -52,9 +52,44 @@ public class Page {
private String charset; private String charset;
/**
* Returns a new {@link Page} whose {@link #downloadSuccess} is {@code true}
* and whose {@link #request} is the given request.
*
* @param request the request.
* @return a page marked as successfully downloaded.
* @since 1.0.2
*/
public static Page ofSuccess(Request request) {
return new Page(request, true);
}
/**
* Returns a new {@link Page} whose {@link #downloadSuccess} is {@code false}
* and whose {@link #request} is the given request.
*
* @param request the request.
* @return a page marked as a failed download.
* @since 1.0.2
*/
public static Page ofFailure(Request request) {
return new Page(request, false);
}
public Page() { public Page() {
} }
/**
* Constructs a {@link Page} with the given {@link #request}
* and {@link #downloadSuccess} flag.
*
* @param request the request to store on the page.
* @param downloadSuccess the download success flag to store on the page.
* @since 1.0.2
*/
private Page(Request request, boolean downloadSuccess) {
this.request = request;
this.downloadSuccess = downloadSuccess;
}
/** /**
* Returns a {@link Page} with {@link #downloadSuccess} is {@code false}. * Returns a {@link Page} with {@link #downloadSuccess} is {@code false}.
* *
@ -73,7 +108,9 @@ public class Page {
* @param request the {@link Request}. * @param request the {@link Request}.
* @return the page. * @return the page.
* @since 0.10.0 * @since 0.10.0
* @deprecated Use {@link #ofFailure(Request)} instead.
*/ */
@Deprecated(since = "1.0.2", forRemoval = true)
public static Page fail(Request request){ public static Page fail(Request request){
Page page = new Page(); Page page = new Page();
page.setRequest(request); page.setRequest(request);

@ -76,13 +76,14 @@ public class HttpClientDownloader extends AbstractDownloader {
CloseableHttpClient httpClient = getHttpClient(task.getSite()); CloseableHttpClient httpClient = getHttpClient(task.getSite());
Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null; Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(request, task) : null;
HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
Page page = Page.fail(request); Page page = null;
try { try {
httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
onSuccess(page, task); onSuccess(page, task);
return page; return page;
} catch (IOException e) { } catch (IOException e) {
page = Page.ofFailure(request);
onError(page, task, e); onError(page, task, e);
return page; return page;
} finally { } finally {
@ -105,7 +106,7 @@ public class HttpClientDownloader extends AbstractDownloader {
HttpEntity entity = httpResponse.getEntity(); HttpEntity entity = httpResponse.getEntity();
byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0]; byte[] bytes = entity != null ? IOUtils.toByteArray(entity.getContent()) : new byte[0];
String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null; String contentType = entity != null && entity.getContentType() != null ? entity.getContentType().getValue() : null;
Page page = new Page(); Page page = Page.ofSuccess(request);
page.setBytes(bytes); page.setBytes(bytes);
if (!request.isBinaryContent()) { if (!request.isBinaryContent()) {
if (charset == null) { if (charset == null) {
@ -117,7 +118,6 @@ public class HttpClientDownloader extends AbstractDownloader {
page.setUrl(new PlainText(request.getUrl())); page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request); page.setRequest(request);
page.setStatusCode(httpResponse.getStatusLine().getStatusCode()); page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
page.setDownloadSuccess(true);
if (responseHeader) { if (responseHeader) {
page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders())); page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
} }

@ -10,7 +10,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<artifactId>webmagic-coverage</artifactId> <artifactId>webmagic-coverage</artifactId>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -8,7 +8,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic</artifactId> <artifactId>webmagic</artifactId>
<version>1.0.1</version> <version>1.0.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

Loading…
Cancel
Save