diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..0cecd852
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,191 @@
+Apache License
+Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the copyright
+owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other entities
+that control, are controlled by, or are under common control with that entity.
+For the purposes of this definition, "control" means (i) the power, direct or
+indirect, to cause the direction or management of such entity, whether by
+contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
+outstanding shares, or (iii) beneficial ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising
+permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications, including
+but not limited to software source code, documentation source, and configuration
+files.
+
+"Object" form shall mean any form resulting from mechanical transformation or
+translation of a Source form, including but not limited to compiled object code,
+generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object form, made
+available under the License, as indicated by a copyright notice that is included
+in or attached to the work (an example is provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object form, that
+is based on (or derived from) the Work and for which the editorial revisions,
+annotations, elaborations, or other modifications represent, as a whole, an
+original work of authorship. For the purposes of this License, Derivative Works
+shall not include works that remain separable from, or merely link (or bind by
+name) to the interfaces of, the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including the original version
+of the Work and any modifications or additions to that Work or Derivative Works
+thereof, that is intentionally submitted to Licensor for inclusion in the Work
+by the copyright owner or by an individual or Legal Entity authorized to submit
+on behalf of the copyright owner. For the purposes of this definition,
+"submitted" means any form of electronic, verbal, or written communication sent
+to the Licensor or its representatives, including but not limited to
+communication on electronic mailing lists, source code control systems, and
+issue tracking systems that are managed by, or on behalf of, the Licensor for
+the purpose of discussing and improving the Work, but excluding communication
+that is conspicuously marked or otherwise designated in writing by the copyright
+owner as "Not a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
+of whom a Contribution has been received by Licensor and subsequently
+incorporated within the Work.
+
+2. Grant of Copyright License.
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable copyright license to reproduce, prepare Derivative Works of,
+publicly display, publicly perform, sublicense, and distribute the Work and such
+Derivative Works in Source or Object form.
+
+3. Grant of Patent License.
+
+Subject to the terms and conditions of this License, each Contributor hereby
+grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
+irrevocable (except as stated in this section) patent license to make, have
+made, use, offer to sell, sell, import, and otherwise transfer the Work, where
+such license applies only to those patent claims licensable by such Contributor
+that are necessarily infringed by their Contribution(s) alone or by combination
+of their Contribution(s) with the Work to which such Contribution(s) was
+submitted. If You institute patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that the Work or a
+Contribution incorporated within the Work constitutes direct or contributory
+patent infringement, then any patent licenses granted to You under this License
+for that Work shall terminate as of the date such litigation is filed.
+
+4. Redistribution.
+
+You may reproduce and distribute copies of the Work or Derivative Works thereof
+in any medium, with or without modifications, and in Source or Object form,
+provided that You meet the following conditions:
+
+You must give any other recipients of the Work or Derivative Works a copy of
+this License; and
+You must cause any modified files to carry prominent notices stating that You
+changed the files; and
+You must retain, in the Source form of any Derivative Works that You distribute,
+all copyright, patent, trademark, and attribution notices from the Source form
+of the Work, excluding those notices that do not pertain to any part of the
+Derivative Works; and
+If the Work includes a "NOTICE" text file as part of its distribution, then any
+Derivative Works that You distribute must include a readable copy of the
+attribution notices contained within such NOTICE file, excluding those notices
+that do not pertain to any part of the Derivative Works, in at least one of the
+following places: within a NOTICE text file distributed as part of the
+Derivative Works; within the Source form or documentation, if provided along
+with the Derivative Works; or, within a display generated by the Derivative
+Works, if and wherever such third-party notices normally appear. The contents of
+the NOTICE file are for informational purposes only and do not modify the
+License. You may add Your own attribution notices within Derivative Works that
+You distribute, alongside or as an addendum to the NOTICE text from the Work,
+provided that such additional attribution notices cannot be construed as
+modifying the License.
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a whole,
+provided Your use, reproduction, and distribution of the Work otherwise complies
+with the conditions stated in this License.
+
+5. Submission of Contributions.
+
+Unless You explicitly state otherwise, any Contribution intentionally submitted
+for inclusion in the Work by You to the Licensor shall be under the terms and
+conditions of this License, without any additional terms or conditions.
+Notwithstanding the above, nothing herein shall supersede or modify the terms of
+any separate license agreement you may have executed with Licensor regarding
+such Contributions.
+
+6. Trademarks.
+
+This License does not grant permission to use the trade names, trademarks,
+service marks, or product names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty.
+
+Unless required by applicable law or agreed to in writing, Licensor provides the
+Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
+including, without limitation, any warranties or conditions of TITLE,
+NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
+solely responsible for determining the appropriateness of using or
+redistributing the Work and assume any risks associated with Your exercise of
+permissions under this License.
+
+8. Limitation of Liability.
+
+In no event and under no legal theory, whether in tort (including negligence),
+contract, or otherwise, unless required by applicable law (such as deliberate
+and grossly negligent acts) or agreed to in writing, shall any Contributor be
+liable to You for damages, including any direct, indirect, special, incidental,
+or consequential damages of any character arising as a result of this License or
+out of the use or inability to use the Work (including but not limited to
+damages for loss of goodwill, work stoppage, computer failure or malfunction, or
+any and all other commercial damages or losses), even if such Contributor has
+been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability.
+
+While redistributing the Work or Derivative Works thereof, You may choose to
+offer, and charge a fee for, acceptance of support, warranty, indemnity, or
+other liability obligations and/or rights consistent with this License. However,
+in accepting such obligations, You may act only on Your own behalf and on Your
+sole responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any liability
+incurred by, or claims asserted against, such Contributor by reason of your
+accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work
+
+To apply the Apache License to your work, attach the following boilerplate
+notice, with the fields enclosed by brackets "{}" replaced with your own
+identifying information. (Don't include the brackets!) The text should be
+enclosed in the appropriate comment syntax for the file format. We also
+recommend that a file or class name and description of purpose be included on
+the same "printed page" as the copyright notice for easier identification within
+third-party archives.
+
+ Copyright 2013 code4craft
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/README-zh.md b/README-zh.md
index d69dd63f..e8f07355 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -168,30 +168,6 @@ webmagic的使用可以参考:[oschina openapi 应用:博客搬家](http://m
webmagic遵循[Apache 2.0协议](http://opensource.org/licenses/Apache-2.0)
-### 贡献者:
-
-以下是为WebMagic提交过代码或者issue的朋友:
-
-* [ccliangbo](https://github.com/ccliangbo)
-* [yuany](https://github.com/yuany)
-* [yxssfxwzy](https://github.com/yxssfxwzy)
-* [linkerlin](https://github.com/linkerlin)
-* [d0ngw](https://github.com/d0ngw)
-* [xuchaoo](https://github.com/xuchaoo)
-* [supermicah](https://github.com/supermicah)
-* [SimpleExpress](https://github.com/SimpleExpress)
-* [aruanruan](https://github.com/aruanruan)
-* [l1z2g9](https://github.com/l1z2g9)
-* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
-* [ywooer](https://github.com/ywooer)
-* [yyw258520](https://github.com/yyw258520)
-* [perfecking](https://github.com/perfecking)
-* [lidongyang](http://my.oschina.net/lidongyang)
-* [seveniu](https://github.com/seveniu)
-* [sebastian1118](https://github.com/sebastian1118)
-* [codev777](https://github.com/codev777)
-* [fengwuze](https://github.com/fengwuze)
-
### 邮件组:
Gmail:
diff --git a/README.md b/README.md
index 285eb609..87858443 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
- page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+ page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
@@ -89,7 +89,7 @@ You can also use annotation way:
@HelpUrl("https://github.com/\\w+")
public class GithubRepo {
- @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
+ @ExtractBy(value = "//h1[@class='public']/strong/a/text()", notNull = true)
private String name;
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
@@ -114,39 +114,12 @@ The architecture of webmagic (refered to [Scrapy](http://scrapy.org/))

-Javadocs: [http://code4craft.github.io/webmagic/docs/en/](http://code4craft.github.io/webmagic/docs/en/)
-
-There are some samples in `webmagic-samples` package.
+There are more examples in the `webmagic-samples` package.
### Lisence:
Lisenced under [Apache 2.0 lisence](http://opensource.org/licenses/Apache-2.0)
-### Contributors:
-
-Thanks these people for commiting source code, reporting bugs or suggesting for new feature:
-
-* [ccliangbo](https://github.com/ccliangbo)
-* [yuany](https://github.com/yuany)
-* [yxssfxwzy](https://github.com/yxssfxwzy)
-* [linkerlin](https://github.com/linkerlin)
-* [d0ngw](https://github.com/d0ngw)
-* [xuchaoo](https://github.com/xuchaoo)
-* [supermicah](https://github.com/supermicah)
-* [SimpleExpress](https://github.com/SimpleExpress)
-* [aruanruan](https://github.com/aruanruan)
-* [l1z2g9](https://github.com/l1z2g9)
-* [zhegexiaohuozi](https://github.com/zhegexiaohuozi)
-* [ywooer](https://github.com/ywooer)
-* [yyw258520](https://github.com/yyw258520)
-* [perfecking](https://github.com/perfecking)
-* [lidongyang](http://my.oschina.net/lidongyang)
-* [seveniu](https://github.com/seveniu)
-* [sebastian1118](https://github.com/sebastian1118)
-* [codev777](https://github.com/codev777)
-* [fengwuze](https://github.com/fengwuze)
-
-
### Thanks:
To write webmagic, I refered to the projects below :
diff --git a/pom.xml b/pom.xml
index 0743c02b..04b6dec0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -64,6 +64,12 @@
4.11
test
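+ <dependency>
+     <groupId>org.mockito</groupId>
+     <artifactId>mockito-all</artifactId>
+     <version>1.10.19</version>
+     <scope>test</scope>
+ </dependency>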
org.apache.httpcomponents
httpclient
@@ -97,7 +103,7 @@
com.alibaba
fastjson
- 1.2.21
+ 1.2.28
com.github.dreamhead
@@ -130,7 +136,7 @@
commons-collections
commons-collections
- 3.2.1
+ 3.2.2
org.apache.commons
diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml
index fbd5034c..ad969612 100644
--- a/webmagic-core/pom.xml
+++ b/webmagic-core/pom.xml
@@ -40,6 +40,11 @@
slf4j-api
+ <dependency>
+     <groupId>org.mockito</groupId>
+     <artifactId>mockito-all</artifactId>
+ </dependency>
+
org.slf4j
slf4j-log4j12
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
index 62f21f8e..7c0064d1 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java
@@ -107,14 +107,12 @@ public class Page {
* @param requests requests
*/
public void addTargetRequests(List requests) {
- synchronized (targetRequests) {
- for (String s : requests) {
- if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
- continue;
- }
- s = UrlUtils.canonicalizeUrl(s, url.toString());
- targetRequests.add(new Request(s));
+ for (String s : requests) {
+ if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
+ continue;
}
+ s = UrlUtils.canonicalizeUrl(s, url.toString());
+ targetRequests.add(new Request(s));
}
}
@@ -125,14 +123,12 @@ public class Page {
* @param priority priority
*/
public void addTargetRequests(List requests, long priority) {
- synchronized (targetRequests) {
- for (String s : requests) {
- if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
- continue;
- }
- s = UrlUtils.canonicalizeUrl(s, url.toString());
- targetRequests.add(new Request(s).setPriority(priority));
+ for (String s : requests) {
+ if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) {
+ continue;
}
+ s = UrlUtils.canonicalizeUrl(s, url.toString());
+ targetRequests.add(new Request(s).setPriority(priority));
}
}
@@ -145,10 +141,8 @@ public class Page {
if (StringUtils.isBlank(requestString) || requestString.equals("#")) {
return;
}
- synchronized (targetRequests) {
- requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
- targetRequests.add(new Request(requestString));
- }
+ requestString = UrlUtils.canonicalizeUrl(requestString, url.toString());
+ targetRequests.add(new Request(requestString));
}
/**
@@ -157,9 +151,7 @@ public class Page {
* @param request request
*/
public void addTargetRequest(Request request) {
- synchronized (targetRequests) {
- targetRequests.add(request);
- }
+ targetRequests.add(request);
}
/**
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
index 4d7f4270..c8c59782 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java
@@ -85,27 +85,10 @@ public class Request implements Serializable {
return url;
}
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- Request request = (Request) o;
-
- if (!url.equals(request.url)) return false;
-
- return true;
- }
-
public Map getExtras() {
return extras;
}
- @Override
- public int hashCode() {
- return url.hashCode();
- }
-
public void setExtras(Map extras) {
this.extras = extras;
}
@@ -132,23 +115,52 @@ public class Request implements Serializable {
return params;
}
/**
- * POST/GET参数设置
+ * Sets the params for this request.
+ *
+ * DO NOT set this for a request whose URL already has params, like 'https://github.com/search?q=webmagic'
+ * @param params params
* */
public void setParams(Map params) {
this.params = params;
}
/**
- * POST/GET参数设置
+ * Adds a single param to this request.
+ *
+ * DO NOT set this for a request whose URL already has params, like 'https://github.com/search?q=webmagic'
+ * @param key key
+ * @param value value
* */
public void putParams(String key,String value) {
params.put(key,value);
}
+
+ /**
+  * Two requests are equal when they are of the same class and their url,
+  * method and params are all equal (each of them may be null).
+  */
+ @Override
+ public boolean equals(Object o) {
+     if (this == o) {
+         return true;
+     }
+     if (o == null || getClass() != o.getClass()) {
+         return false;
+     }
+     Request other = (Request) o;
+     // Null-safe comparisons, evaluated in the order url, method, params.
+     return (url == null ? other.url == null : url.equals(other.url))
+             && (method == null ? other.method == null : method.equals(other.method))
+             && (params == null ? other.params == null : params.equals(other.params));
+ }
+
+ /**
+  * Hash over url, method and params (the same fields compared by equals),
+  * with a null field contributing 0.
+  */
+ @Override
+ public int hashCode() {
+     int hash = (url == null) ? 0 : url.hashCode();
+     hash = 31 * hash + ((method == null) ? 0 : method.hashCode());
+     hash = 31 * hash + ((params == null) ? 0 : params.hashCode());
+     return hash;
+ }
+
@Override
public String toString() {
return "Request{" +
"url='" + url + '\'' +
", method='" + method + '\'' +
", extras=" + extras +
+ ", params=" + params +
", priority=" + priority +
'}';
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index b1afb660..49734b7e 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -305,7 +305,7 @@ public class Spider implements Runnable, Task {
initComponent();
logger.info("Spider " + getUUID() + " started!");
while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
- Request request = scheduler.poll(this);
+ final Request request = scheduler.poll(this);
if (request == null) {
if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
break;
@@ -313,16 +313,15 @@ public class Spider implements Runnable, Task {
// wait until new url added
waitNewUrl();
} else {
- final Request requestFinal = request;
threadPool.execute(new Runnable() {
@Override
public void run() {
try {
- processRequest(requestFinal);
- onSuccess(requestFinal);
+ processRequest(request);
+ onSuccess(request);
} catch (Exception e) {
- onError(requestFinal);
- logger.error("process request " + requestFinal + " error", e);
+ onError(request);
+ logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
signalNewUrl();
@@ -587,6 +586,7 @@ public class Spider implements Runnable, Task {
if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!");
}
+ this.executorService = executorService;
return this;
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
index 6c1e89c0..9e77ef5f 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java
@@ -1,7 +1,6 @@
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
@@ -15,10 +14,6 @@ import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
@@ -27,8 +22,8 @@ import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.selector.PlainText;
+import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpConstant;
-import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;
import java.io.IOException;
@@ -98,8 +93,8 @@ public class HttpClientDownloader extends AbstractDownloader {
proxyHost = site.getHttpProxy();
}
- HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);//���������˴���
- httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);//getHttpClient�������˴�����֤
+ HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
+ httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
statusCode = httpResponse.getStatusLine().getStatusCode();
request.putExtra(Request.STATUS_CODE, statusCode);
if (statusAccept(acceptStatCode, statusCode)) {
@@ -167,39 +162,44 @@ public class HttpClientDownloader extends AbstractDownloader {
String method = request.getMethod();
if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) {
//default get
- RequestBuilder requestBuilder=RequestBuilder.get();
- if (request.getParams() != null) {
- for (Map.Entry entry : request.getParams().entrySet()) {
- requestBuilder.addParameter(entry.getKey(), entry.getValue());
- }
- }
- return requestBuilder;
+ return addQueryParams(RequestBuilder.get(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) {
- RequestBuilder requestBuilder = RequestBuilder.post();
- NameValuePair[] nameValuePair = (NameValuePair[]) request.getExtra("nameValuePair");
- List allNameValuePair=new ArrayList();
- if (nameValuePair != null && nameValuePair.length > 0) {
- allNameValuePair= Arrays.asList(nameValuePair);
- }
- if (request.getParams() != null) {
- for (String key : request.getParams().keySet()) {
- allNameValuePair.add(new BasicNameValuePair(key, request.getParams().get(key)));
- }
- }
- requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
- return requestBuilder;
+ return addFormParams(RequestBuilder.post(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
- return RequestBuilder.head();
+ return addQueryParams(RequestBuilder.head(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) {
- return RequestBuilder.put();
+ return addFormParams(RequestBuilder.put(), (NameValuePair[]) request.getExtra("nameValuePair"), request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
- return RequestBuilder.delete();
+ return addQueryParams(RequestBuilder.delete(),request.getParams());
} else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
- return RequestBuilder.trace();
+ return addQueryParams(RequestBuilder.trace(),request.getParams());
}
throw new IllegalArgumentException("Illegal HTTP Method " + method);
}
+ /**
+  * Attaches a URL-encoded form entity to the builder, made of the pairs passed in
+  * {@code nameValuePair} followed by the entries of {@code params}.
+  *
+  * @param requestBuilder builder that receives the form entity
+  * @param nameValuePair  pairs to send first; may be null or empty
+  * @param params         additional params to send; may be null
+  * @return the same {@code requestBuilder}
+  */
+ private RequestBuilder addFormParams(RequestBuilder requestBuilder, NameValuePair[] nameValuePair, Map<String, String> params) {
+     List<NameValuePair> allNameValuePair = new ArrayList<NameValuePair>();
+     if (nameValuePair != null && nameValuePair.length > 0) {
+         // Copy the array: Arrays.asList() alone is a fixed-size view and the add() below would throw.
+         allNameValuePair.addAll(Arrays.asList(nameValuePair));
+     }
+     if (params != null) {
+         for (Map.Entry<String, String> entry : params.entrySet()) {
+             allNameValuePair.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
+         }
+     }
+     requestBuilder.setEntity(new UrlEncodedFormEntity(allNameValuePair, Charset.forName("utf8")));
+     return requestBuilder;
+ }
+
+ /**
+  * Adds every entry of {@code params} to the builder via {@code addParameter}.
+  *
+  * @param requestBuilder builder that receives the parameters
+  * @param params         params to add; may be null, in which case nothing is added
+  * @return the same {@code requestBuilder}
+  */
+ private RequestBuilder addQueryParams(RequestBuilder requestBuilder, Map<String, String> params) {
+     if (params == null) {
+         return requestBuilder;
+     }
+     for (Map.Entry<String, String> entry : params.entrySet()) {
+         requestBuilder.addParameter(entry.getKey(), entry.getValue());
+     }
+     return requestBuilder;
+ }
+
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
String content = getContent(charset, httpResponse);
Page page = new Page();
@@ -226,40 +226,6 @@ public class HttpClientDownloader extends AbstractDownloader {
}
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
- String charset;
- // charset
- // 1、encoding in http header Content-Type
- String value = httpResponse.getEntity().getContentType().getValue();
- charset = UrlUtils.getCharset(value);
- if (StringUtils.isNotBlank(charset)) {
- logger.debug("Auto get charset: {}", charset);
- return charset;
- }
- // use default charset to decode first time
- Charset defaultCharset = Charset.defaultCharset();
- String content = new String(contentBytes, defaultCharset.name());
- // 2、charset in meta
- if (StringUtils.isNotEmpty(content)) {
- Document document = Jsoup.parse(content);
- Elements links = document.select("meta");
- for (Element link : links) {
- // 2.1、html4.01
- String metaContent = link.attr("content");
- String metaCharset = link.attr("charset");
- if (metaContent.indexOf("charset") != -1) {
- metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length());
- charset = metaContent.split("=")[1];
- break;
- }
- // 2.2、html5
- else if (StringUtils.isNotEmpty(metaCharset)) {
- charset = metaCharset;
- break;
- }
- }
- }
- logger.debug("Auto get charset: {}", charset);
- // 3、todo use tools as cpdetector for content decode
- return charset;
+ // getContentType() returns null when the response carries no Content-Type header;
+ // pass an empty value in that case instead of dereferencing null.
+ // NOTE(review): assumes CharsetUtils.detectCharset tolerates an empty header value - confirm.
+ String contentType = httpResponse.getEntity().getContentType() == null
+         ? "" : httpResponse.getEntity().getContentType().getValue();
+ return CharsetUtils.detectCharset(contentType, contentBytes);
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
index 955bd5a3..e93ab4cd 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java
@@ -18,7 +18,7 @@ public class GithubRepoPageProcessor implements PageProcessor {
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
- page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
+ page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
if (page.getResultItems().get("name")==null){
//skip this page
page.setSkip(true);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
index 26096715..dbe3a182 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/Proxy.java
@@ -79,14 +79,14 @@ public class Proxy implements Delayed, Serializable {
private List failedErrorType = new ArrayList();
- Proxy(HttpHost httpHost, String user, String password) {
+ public Proxy(HttpHost httpHost, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
this.canReuseTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(reuseTimeInterval, TimeUnit.MILLISECONDS);
}
- Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
+ public Proxy(HttpHost httpHost, int reuseInterval, String user, String password) {
this.httpHost = httpHost;
this.user = user;
this.password = password;
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
index 9be7adb5..ecbeecb6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/DuplicateRemovedScheduler.java
@@ -6,6 +6,7 @@ import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
+import us.codecraft.webmagic.utils.HttpConstant;
/**
* Remove duplicate urls and only push urls which are not duplicate.
@@ -31,7 +32,7 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
@Override
public void push(Request request, Task task) {
logger.trace("get a candidate url {}", request.getUrl());
- if (!duplicatedRemover.isDuplicate(request, task) || shouldReserved(request)) {
+ if (shouldReserved(request) || noNeedToRemoveDuplicate(request) || !duplicatedRemover.isDuplicate(request, task)) {
logger.debug("push to queue {}", request.getUrl());
pushWhenNoDuplicate(request, task);
}
@@ -41,6 +42,10 @@ public abstract class DuplicateRemovedScheduler implements Scheduler {
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
}
+ /**
+  * Tells whether duplicate removal is bypassed for the request.
+  *
+  * @param request the candidate request
+  * @return true when the request method is POST (compared case-insensitively)
+  */
+ protected boolean noNeedToRemoveDuplicate(Request request) {
+     final String requestMethod = request.getMethod();
+     return HttpConstant.Method.POST.equalsIgnoreCase(requestMethod);
+ }
+
protected void pushWhenNoDuplicate(Request request, Task task) {
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
index c38311f2..078506c6 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java
@@ -26,7 +26,7 @@ public class QueueScheduler extends DuplicateRemovedScheduler implements Monitor
}
@Override
- public synchronized Request poll(Task task) {
+ public Request poll(Task task) {
return queue.poll();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
index 43818965..584cf900 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
@@ -28,8 +28,7 @@ public class RegexSelector implements Selector {
}
// Check bracket for regex group. Add default group 1 if there is no group.
// Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
- if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
- StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
+ if ( ! hasGroup(regexStr) ){
regexStr = "(" + regexStr + ")";
}
this.regexStr = regexStr;
@@ -45,6 +44,30 @@ public class RegexSelector implements Selector {
this(regexStr, 1);
}
+ /**
+  * Tells whether the regex already declares at least one capturing group.
+  * <p>
+  * The regex is compiled and the group count is read from the matcher, so the regex
+  * engine itself decides what is a capturing group. Non-capturing groups, look-ahead,
+  * look-behind, escaped parentheses and parentheses inside character classes are not
+  * counted, even when several of them are mixed in one pattern; named groups are
+  * counted because they are capturing groups.
+  *
+  * @param regexStr the regex to inspect
+  * @return {@code true} if the regex has a capturing group, or if it does not compile
+  *         (the regex is then left as it is and validation stays with Pattern)
+  */
+ private boolean hasGroup(String regexStr) {
+     try {
+         return java.util.regex.Pattern.compile(regexStr).matcher("").groupCount() > 0;
+     } catch (java.util.regex.PatternSyntaxException e) {
+         // Invalid regex: report "has group" so the caller does not wrap it in extra parentheses.
+         return true;
+     }
+ }
+
@Override
public String select(String text) {
return selectGroup(text).get(group);
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
new file mode 100644
index 00000000..50b4f1b6
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/CharsetUtils.java
@@ -0,0 +1,61 @@
+package us.codecraft.webmagic.utils;
+
+import org.apache.commons.lang3.StringUtils;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+/**
+ * Helper for working out which charset a downloaded document is encoded in.
+ *
+ * @author code4crafter@gmail.com
+ *         Date: 17/3/11
+ *         Time: 10:36
+ * @since 0.6.2
+ */
+public abstract class CharsetUtils {
+
+    private static final Logger logger = LoggerFactory.getLogger(CharsetUtils.class);
+
+    /**
+     * Detects the charset of a document from its Content-Type header or, failing that,
+     * from the {@code <meta>} tags found in the content.
+     *
+     * @param contentType  value of the HTTP Content-Type header, handed to {@code UrlUtils.getCharset}
+     * @param contentBytes raw bytes of the document
+     * @return the charset found in the header or in a meta tag; when neither declares one,
+     *         the blank value obtained from {@code UrlUtils.getCharset} is returned unchanged
+     * @throws IOException declared for callers; nothing in this method raises it directly
+     */
+    public static String detectCharset(String contentType, byte[] contentBytes) throws IOException {
+        // 1. charset declared in the HTTP Content-Type header.
+        //    The extracted charset is tested, not the header itself: a header without a
+        //    charset parameter must fall through to the meta tag lookup below.
+        String charset = UrlUtils.getCharset(contentType);
+        if (StringUtils.isNotBlank(charset)) {
+            logger.debug("Auto get charset: {}", charset);
+            return charset;
+        }
+        // Decode with the platform default charset so the markup can be scanned for meta tags.
+        Charset defaultCharset = Charset.defaultCharset();
+        String content = new String(contentBytes, defaultCharset);
+        // 2. charset declared in a meta tag
+        if (StringUtils.isNotEmpty(content)) {
+            Document document = Jsoup.parse(content);
+            Elements metas = document.select("meta");
+            for (Element meta : metas) {
+                // 2.1 HTML 4.01 style: the charset is embedded in the "content" attribute
+                String metaContent = meta.attr("content");
+                int charsetIndex = metaContent.indexOf("charset");
+                if (charsetIndex != -1) {
+                    String[] parts = metaContent.substring(charsetIndex).split("=");
+                    // Guard against "charset" appearing without a value instead of failing on parts[1].
+                    if (parts.length > 1 && StringUtils.isNotBlank(parts[1])) {
+                        charset = parts[1].trim();
+                        break;
+                    }
+                }
+                // 2.2 HTML5 style: dedicated "charset" attribute
+                String metaCharset = meta.attr("charset");
+                if (StringUtils.isNotEmpty(metaCharset)) {
+                    charset = metaCharset;
+                    break;
+                }
+            }
+        }
+        logger.debug("Auto get charset: {}", charset);
+        // 3. TODO: use a tool such as cpdetector to detect the charset from the raw bytes
+        return charset;
+    }
+
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
new file mode 100644
index 00000000..c7e4943d
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/RequestTest.java
@@ -0,0 +1,25 @@
+package us.codecraft.webmagic;
+
+import org.junit.Test;
+import us.codecraft.webmagic.utils.HttpConstant;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/3/11
+ */
+public class RequestTest {
+
+    /**
+     * Two requests built from the same URL are equal and share a hash code; after the
+     * method of one of them is set to POST they no longer are.
+     */
+    @Test
+    public void testEqualsAndHashCode() throws Exception {
+        final String url = "http://www.google.com/";
+        Request first = new Request(url);
+        Request second = new Request(url);
+        assertThat(first.hashCode()).isEqualTo(second.hashCode());
+        assertThat(first).isEqualTo(second);
+
+        // Switch the method to GET and then to POST before comparing again.
+        first.setMethod(HttpConstant.Method.GET);
+        first.setMethod(HttpConstant.Method.POST);
+        assertThat(first).isNotEqualTo(second);
+        assertThat(first.hashCode()).isNotEqualTo(second.hashCode());
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
index 1735e00b..0e442a87 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/HttpClientDownloaderTest.java
@@ -5,13 +5,17 @@ import com.github.dreamhead.moco.Runnable;
import com.github.dreamhead.moco.Runner;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.util.EntityUtils;
import org.junit.Test;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Html;
+import us.codecraft.webmagic.utils.HttpConstant;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
@@ -103,4 +107,42 @@ public class HttpClientDownloaderTest {
}
});
}
+
+ /**
+  * Checks that {@code selectRequestMethod} builds a request using the HTTP method set on
+  * the request: each built request is sent to a stub server that answers differently
+  * per method, and the answer is compared with the expected marker.
+  */
+ @Test
+ public void test_selectRequestMethod() throws Exception {
+     HttpServer server = httpserver(12306);
+     server.get(eq(query("q"), "webmagic")).response("get");
+     server.post(eq(form("q"), "webmagic")).response("post");
+     server.put(eq(form("q"), "webmagic")).response("put");
+     server.delete(eq(query("q"), "webmagic")).response("delete");
+     server.request(and(by(method("HEAD")),eq(query("q"), "webmagic"))).response(header("method","head"));
+     server.request(and(by(method("TRACE")),eq(query("q"), "webmagic"))).response("trace");
+     Runner.running(server, new Runnable() {
+         @Override
+         public void run() throws Exception {
+             HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
+             Request request = new Request();
+             request.setUrl("http://127.0.0.1:12306/search");
+             request.putParams("q", "webmagic");
+             // One shared client, closed in finally, instead of a new never-closed client per call.
+             CloseableHttpClient httpClient = HttpClients.custom().build();
+             try {
+                 RequestBuilder requestBuilder;
+                 request.setMethod(HttpConstant.Method.GET);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("get");
+                 request.setMethod(HttpConstant.Method.POST);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("post");
+                 request.setMethod(HttpConstant.Method.PUT);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("put");
+                 request.setMethod(HttpConstant.Method.DELETE);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("delete");
+                 // HEAD responses carry no body, so the stub answers through a header instead.
+                 request.setMethod(HttpConstant.Method.HEAD);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(httpClient.execute(requestBuilder.build()).getFirstHeader("method").getValue()).isEqualTo("head");
+                 request.setMethod(HttpConstant.Method.TRACE);
+                 requestBuilder = httpClientDownloader.selectRequestMethod(request).setUri(request.getUrl());
+                 assertThat(EntityUtils.toString(httpClient.execute(requestBuilder.build()).getEntity())).isEqualTo("trace");
+             } finally {
+                 httpClient.close();
+             }
+         }
+     });
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java
new file mode 100644
index 00000000..a0980494
--- /dev/null
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/scheduler/DuplicateRemovedSchedulerTest.java
@@ -0,0 +1,50 @@
+package us.codecraft.webmagic.scheduler;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.mockito.Mockito;
+import org.mockito.runners.MockitoJUnitRunner;
+import us.codecraft.webmagic.Request;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
+import us.codecraft.webmagic.utils.HttpConstant;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+/**
+ * @author code4crafter@gmail.com
+ * Date: 17/3/11
+ * Time: 11:26 AM
+ */
+@RunWith(MockitoJUnitRunner.class)
+public class DuplicateRemovedSchedulerTest {
+
+    /** Minimal concrete scheduler: only {@code poll} is overridden, {@code push} is inherited. */
+    private DuplicateRemovedScheduler duplicateRemovedScheduler = new DuplicateRemovedScheduler() {
+        @Override
+        public Request poll(Task task) {
+            return null;
+        }
+    };
+
+    /** Pushing a POST request must not consult the duplicate remover. */
+    @Test
+    public void test_no_duplicate_removed_for_post_request() throws Exception {
+        DuplicateRemover remover = pushWithMethod(HttpConstant.Method.POST);
+        verify(remover, times(0)).isDuplicate(any(Request.class), any(Task.class));
+    }
+
+    /** Pushing a GET request must consult the duplicate remover exactly once. */
+    @Test
+    public void test_duplicate_removed_for_get_request() throws Exception {
+        DuplicateRemover remover = pushWithMethod(HttpConstant.Method.GET);
+        verify(remover, times(1)).isDuplicate(any(Request.class), any(Task.class));
+    }
+
+    /**
+     * Installs a fresh mock remover on the scheduler and pushes one request with the
+     * given method and a null task.
+     *
+     * @param method HTTP method to set on the pushed request
+     * @return the mock remover, ready for verification
+     */
+    private DuplicateRemover pushWithMethod(String method) {
+        DuplicateRemover remover = Mockito.mock(DuplicateRemover.class);
+        duplicateRemovedScheduler.setDuplicateRemover(remover);
+        Request request = new Request("https://www.google.com/");
+        request.setMethod(method);
+        duplicateRemovedScheduler.push(request, null);
+        return remover;
+    }
+}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
index 63e8e43b..144e6fe2 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
@@ -22,4 +22,20 @@ public class RegexSelectorTest {
String select = regexSelector.select(source);
Assertions.assertThat(select).isEqualTo(source);
}
+
+ /**
+  * A regex whose only parentheses belong to a zero-width look-ahead must still select
+  * the text matched in front of the assertion.
+  */
+ @Test
+ public void testRegexWithZeroWidthAssertions() {
+     // Positive look-ahead: everything up to (not including) the question mark.
+     RegexSelector positiveLookahead = new RegexSelector("^.*(?=\\?)");
+     String beforeQuestionMark = positiveLookahead.select("hello world?xxxx");
+     Assertions.assertThat(beforeQuestionMark).isEqualTo("hello world");
+
+     // Negative look-ahead: three digits that are not followed by another digit.
+     RegexSelector negativeLookahead = new RegexSelector("\\d{3}(?!\\d)");
+     String lastThreeDigits = negativeLookahead.select("123456asdf");
+     Assertions.assertThat(lastThreeDigits).isEqualTo("456");
+ }
}
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
index 86b9db35..a90304dc 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/utils/UrlUtilsTest.java
@@ -20,6 +20,9 @@ public class UrlUtilsTest {
absoluteUrl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/aa");
+ absoluteUrl = UrlUtils.canonicalizeUrl("../mshz", "http://www.court.gov.cn/zgcpwsw/zgrmfy/");
+ assertThat(absoluteUrl).isEqualTo("http://www.court.gov.cn/zgcpwsw/mshz");
+
absoluteUrl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
assertThat(absoluteUrl).isEqualTo("http://www.dianping.com/sh/ss/..aa");
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
index 61551b13..59f4b3f4 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/scheduler/RedisScheduler.java
@@ -48,11 +48,7 @@ public class RedisScheduler extends DuplicateRemovedScheduler implements Monitor
public boolean isDuplicate(Request request, Task task) {
Jedis jedis = pool.getResource();
try {
- boolean isDuplicate = jedis.sismember(getSetKey(task), request.getUrl());
- if (!isDuplicate) {
- jedis.sadd(getSetKey(task), request.getUrl());
- }
- return isDuplicate;
+ // SADD returns the number of members newly added: 0 means the URL was already in the set.
+ return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
} finally {
pool.returnResource(jedis);
}
diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml
index 6ddc61cf..b66ca0cf 100644
--- a/webmagic-selenium/pom.xml
+++ b/webmagic-selenium/pom.xml
@@ -13,7 +13,7 @@
org.seleniumhq.selenium
selenium-java
- 2.46.0
+ 2.41.0
us.codecraft
diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
index 59f83ea5..1472cb32 100644
--- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
+++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/WebDriverPool.java
@@ -45,7 +45,7 @@ class WebDriverPool {
private WebDriver mDriver = null;
private boolean mAutoQuitDriver = true;
- private static final String CONFIG_FILE = "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/config.ini";
+ private static final String DEFAULT_CONFIG_FILE = "/data/webmagic/webmagic-selenium/config.ini";
private static final String DRIVER_FIREFOX = "firefox";
private static final String DRIVER_CHROME = "chrome";
private static final String DRIVER_PHANTOMJS = "phantomjs";
@@ -64,7 +64,11 @@ class WebDriverPool {
public void configure() throws IOException {
// Read config file
sConfig = new Properties();
- sConfig.load(new FileReader(CONFIG_FILE));
+ String configFile = DEFAULT_CONFIG_FILE;
+ if (System.getProperty("selenuim_config")!=null){
+ configFile = System.getProperty("selenuim_config");
+ }
+ sConfig.load(new FileReader(configFile));
// Prepare capabilities
sCaps = new DesiredCapabilities();
diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java
index 2854a766..ad3a3e5b 100644
--- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java
+++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/HuabanProcessor.java
@@ -22,7 +22,7 @@ public class HuabanProcessor implements PageProcessor {
public void process(Page page) {
page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());
if (page.getUrl().toString().contains("pins")) {
- page.putField("img", page.getHtml().xpath("//div[@id='pin_img']/a/img/@src").toString());
+ page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
} else {
page.getResultItems().setSkip(true);
}
diff --git a/webmagic-selenium/src/test/resources/config.ini b/webmagic-selenium/src/test/resources/config.ini
new file mode 100644
index 00000000..6bd19af1
--- /dev/null
+++ b/webmagic-selenium/src/test/resources/config.ini
@@ -0,0 +1,11 @@
+#driver=phantomjs
+#driver=firefox
+driver=chrome
+#driver=http://localhost:8910
+driver=http://localhost:4444/wd/hub
+
+# PhantomJS specific config (change according to your installation)
+#phantomjs_exec_path=/Users/detro/bin/phantomjs-qt5
+phantomjs_exec_path=/Users/detro/bin/phantomjs-upstream
+phantomjs_driver_path=../../src/main.js
+phantomjs_driver_loglevel=DEBUG
\ No newline at end of file