diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
index e0552709..4791e77a 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java
@@ -68,4 +68,13 @@ public class ResultItems {
this.skip = skip;
return this;
}
+
+    /**
+     * Render the fields, the request and the skip flag for logging and debugging.
+     */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("ResultItems{");
+        text.append("fields=").append(fields);
+        text.append(", request=").append(request);
+        text.append(", skip=").append(skip);
+        text.append('}');
+        return text.toString();
+    }
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
index 33e9b8f7..22015c36 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
@@ -43,6 +43,8 @@ public class Site {
private HttpHost httpProxy;
+ private boolean useGzip = true;
+
public static interface HeaderConst {
public static final String REFERER = "Referer";
@@ -199,7 +201,10 @@ public class Site {
/**
* Add a url to start url.
+ * Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addUrl(String...)}.
*
+ * @deprecated use {@link Spider#addUrl(String...)} instead
+ * @see Spider#addUrl(String...)
* @param startUrl
* @return this
*/
@@ -209,7 +214,10 @@ public class Site {
/**
* Add a url to start url.
+ * Because urls are more a Spider's property than a Site's, this has moved to {@link Spider#addRequest(Request...)}.
*
+ * @deprecated use {@link Spider#addRequest(Request...)} instead
+ * @see Spider#addRequest(Request...)
* @param startUrl
* @return this
*/
@@ -312,6 +320,22 @@ public class Site {
return this;
}
+ /**
+ * Tell whether gzip is enabled for this site.
+ * The flag is true unless it has been changed with setUseGzip.
+ *
+ * @return the current value of the useGzip flag
+ */
+ public boolean isUseGzip() {
+ return useGzip;
+ }
+
+ /**
+ * Set whether to use gzip.
+ * Default is true, you can set it to false to disable gzip.
+ *
+ * @param useGzip true to use gzip, false to disable it
+ * @return this
+ */
+ public Site setUseGzip(boolean useGzip) {
+ this.useGzip = useGzip;
+ return this;
+ }
+
public Task toTask() {
return new Task() {
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
index 04ac8942..9a580bde 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
@@ -1,9 +1,11 @@
package us.codecraft.webmagic;
+import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
+import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
@@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
+import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
@@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {
protected final static int STAT_STOPPED = 2;
+ protected boolean spawnUrl = true;
+
+ protected boolean destroyWhenExit = true;
+
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
@@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
pipelines.add(new ConsolePipeline());
}
downloader.setThread(threadNum);
- executorService = ThreadUtils.newFixedThreadPool(threadNum);
+ if (executorService == null || executorService.isShutdown()) {
+ executorService = ThreadUtils.newFixedThreadPool(threadNum);
+ }
if (startRequests != null) {
for (Request request : startRequests) {
scheduler.push(request, this);
@@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
});
}
}
- executorService.shutdown();
stat.set(STAT_STOPPED);
// release some resources
- destroy();
+ if (destroyWhenExit) {
+ close();
+ }
}
private void checkRunningStat() {
@@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
}
}
-    protected void destroy() {
+    /**
+     * Release the resources held by this spider.
+     * Passes the downloader, the page processor and every pipeline to destroyEach,
+     * then shuts down the executor service if one has been created.
+     */
+    public void close() {
         destroyEach(downloader);
         destroyEach(pageProcessor);
         for (Pipeline pipeline : pipelines) {
             destroyEach(pipeline);
         }
+        // The executor is created lazily, so it is still null when close() is called on a spider that never ran.
+        if (executorService != null) {
+            executorService.shutdown();
+        }
     }
private void destroyEach(Object object) {
@@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
}
protected void extractAndAddRequests(Page page) {
- if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
+ if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
addRequest(request);
}
@@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
}
+ /**
+ * Push a request to the scheduler.
+ * If the site has no domain yet, the domain of the request's url is stored as the site's domain first.
+ *
+ * @param request the request to schedule
+ */
private void addRequest(Request request) {
+ // Derive the domain from the first usable url when none has been set on the site.
+ if (site.getDomain() == null && request != null && request.getUrl() != null) {
+ site.setDomain(UrlUtils.getDomain(request.getUrl()));
+ }
scheduler.push(request, this);
-
}
protected void checkIfRunning() {
@@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
}
/**
- * Add urls to crawl.
+ * Add urls to crawl.
*
* @param urls
* @return
@@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
return this;
}
+    /**
+     * Download the given urls synchronously and collect what was extracted from them.
+     * Only the given urls are downloaded: urls found in the downloaded pages are not followed.
+     * The spider is not closed when the download ends, so it can be reused;
+     * call {@link #close()} when it is no longer needed.
+     *
+     * @param urls the urls to download
+     * @return the result items collected during this call
+     */
+    public List<ResultItems> getAll(Collection<String> urls) {
+        // Remember the caller's settings so they are restored rather than forced back to true.
+        boolean previousDestroyWhenExit = destroyWhenExit;
+        boolean previousSpawnUrl = spawnUrl;
+        CollectorPipeline collectorPipeline = new CollectorPipeline();
+        destroyWhenExit = false;
+        spawnUrl = false;
+        startRequests = UrlUtils.convertToRequests(urls);
+        pipelines.add(collectorPipeline);
+        try {
+            run();
+        } finally {
+            // Drop the temporary collector so repeated calls do not pile up pipelines.
+            pipelines.remove(collectorPipeline);
+            spawnUrl = previousSpawnUrl;
+            destroyWhenExit = previousDestroyWhenExit;
+        }
+        return collectorPipeline.getCollector();
+    }
+
+    /**
+     * Download a single url synchronously.
+     *
+     * @param url the url to download
+     * @return the first collected result, or null when nothing was collected
+     */
+    public ResultItems get(String url) {
+        List<String> urls = Lists.newArrayList(url);
+        List<ResultItems> resultItemses = getAll(urls);
+        if (resultItemses == null || resultItemses.isEmpty()) {
+            return null;
+        }
+        return resultItemses.get(0);
+    }
+
/**
* Add urls with information to crawl.
*
@@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
return this;
}
+ /**
+ * Tell whether urls extracted from downloaded pages are added to the download queue.
+ *
+ * @return the current value of the spawnUrl flag
+ */
+ public boolean isSpawnUrl() {
+ return spawnUrl;
+ }
+
+ /**
+ * Set whether urls extracted from downloaded pages are added to the download queue.
+ * Add urls to download when it is true, and just download seed urls when it is false.
+ * DO NOT set it unless you know what it means!
+ *
+ * @param spawnUrl true to follow extracted urls, false to download only the seed urls
+ * @return this
+ * @since 0.4.0
+ */
+ public Spider setSpawnUrl(boolean spawnUrl) {
+ this.spawnUrl = spawnUrl;
+ return this;
+ }
+
@Override
public String getUUID() {
if (uuid != null) {
@@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
if (site != null) {
return site.getDomain();
}
- return null;
+ uuid = UUID.randomUUID().toString();
+ return uuid;
}
@Override
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
index dbc38286..a3319a0c 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java
@@ -1,8 +1,9 @@
package us.codecraft.webmagic.downloader;
-import org.apache.http.*;
+import org.apache.http.HttpException;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
-import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
@@ -19,7 +20,7 @@ import java.util.Map;
/**
* @author code4crafter@gmail.com
- * @since 0.3.3
+ * @since 0.4.0
*/
public class HttpClientGenerator {
@@ -46,42 +47,48 @@ public class HttpClientGenerator {
} else {
httpClientBuilder.setUserAgent("");
}
- httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
+ if (site == null || site.isUseGzip()) {
+ httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
- public void process(
- final HttpRequest request,
- final HttpContext context) throws HttpException, IOException {
- if (!request.containsHeader("Accept-Encoding")) {
- request.addHeader("Accept-Encoding", "gzip");
- }
-
- }
- }).addInterceptorFirst(new HttpResponseInterceptor() {
-
- public void process(
- final HttpResponse response,
- final HttpContext context) throws HttpException, IOException {
- HttpEntity entity = response.getEntity();
- if (entity != null) {
- Header ceheader = entity.getContentEncoding();
- if (ceheader != null) {
- HeaderElement[] codecs = ceheader.getElements();
- for (int i = 0; i < codecs.length; i++) {
- if (codecs[i].getName().equalsIgnoreCase("gzip")) {
- response.setEntity(
- new GzipDecompressingEntity(response.getEntity()));
- return;
- }
- }
+ public void process(
+ final HttpRequest request,
+ final HttpContext context) throws HttpException, IOException {
+ if (!request.containsHeader("Accept-Encoding")) {
+ request.addHeader("Accept-Encoding", "gzip");
}
- }
- }
- });
- if (site!=null){
- httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
+ }
+ });
+ }
+// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() {
+//
+// public void process(
+// final HttpResponse response,
+// final HttpContext context) throws HttpException, IOException {
+// if (response.getStatusLine().getStatusCode() != 200) {
+// return;
+// }
+// HttpEntity entity = response.getEntity();
+// if (entity != null) {
+// Header ceheader = entity.getContentEncoding();
+// if (ceheader != null) {
+// HeaderElement[] codecs = ceheader.getElements();
+// for (int i = 0; i < codecs.length; i++) {
+// if (codecs[i].getName().equalsIgnoreCase("gzip")) {
+// response.setEntity(
+// new GzipDecompressingEntity(response.getEntity()));
+// return;
+// }
+// }
+// }
+// }
+// }
+//
+// });
+ if (site != null) {
+ httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
}
- generateCookie(httpClientBuilder,site);
+ generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
new file mode 100644
index 00000000..012c4c56
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java
@@ -0,0 +1,25 @@
+package us.codecraft.webmagic.pipeline;
+
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Pipeline that keeps every processed {@link ResultItems} in memory so the caller can read them back.
+ * Access to the backing list is synchronized, since a Spider running with several
+ * threads may presumably call {@link #process(ResultItems, Task)} concurrently.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.4.0
+ */
+public class CollectorPipeline implements Pipeline {
+
+    private final List<ResultItems> collector = new ArrayList<ResultItems>();
+
+    /**
+     * Store the result items in the collector.
+     *
+     * @param resultItems the items to keep
+     * @param task the task that produced them (unused)
+     */
+    @Override
+    public synchronized void process(ResultItems resultItems, Task task) {
+        collector.add(resultItems);
+    }
+
+    /**
+     * @return the list of everything collected so far (the live list, not a copy)
+     */
+    public synchronized List<ResultItems> getCollector() {
+        return collector;
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java
new file mode 100644
index 00000000..b3e7d78f
--- /dev/null
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java
@@ -0,0 +1,48 @@
+package us.codecraft.webmagic.processor.example;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Example processor that stores a "name" and a "description" field for Baidu Baike pages,
+ * driven through {@link Spider#getAll(java.util.Collection)}.
+ *
+ * @author code4crafter@gmail.com
+ * @since 0.4.0
+ */
+public class BaiduBaikePageProcesser implements PageProcessor {
+
+    private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888))
+            .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
+
+    /**
+     * Extract the entry title and its paragraphs from the page.
+     *
+     * @param page the downloaded page
+     */
+    @Override
+    public void process(Page page) {
+        page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
+        page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Download a fixed list of search urls and print what was extracted from each.
+     */
+    public static void main(String[] args) {
+        Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
+        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
+        List<String> list = new ArrayList<String>();
+        list.add(String.format(urlTemplate,"水力发电"));
+        list.add(String.format(urlTemplate,"风力发电"));
+        list.add(String.format(urlTemplate,"太阳能"));
+        list.add(String.format(urlTemplate,"地热发电"));
+        list.add(String.format(urlTemplate,"众数"));
+        // NOTE(review): this url is already in the list above; confirm whether the duplicate is intentional.
+        list.add(String.format(urlTemplate,"地热发电"));
+        try {
+            List<ResultItems> resultItemses = spider.getAll(list);
+            for (ResultItems resultItemse : resultItemses) {
+                System.out.println(resultItemse.getAll());
+            }
+        } finally {
+            // getAll leaves the spider open, so close it even when the download fails.
+            spider.close();
+        }
+    }
+}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java
index 0e7e3b92..47f904f9 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java
@@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
public class GithubRepoPageProcesser implements PageProcessor {
- private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100);
+ private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
@Override
public void process(Page page) {
@@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
- Spider.create(new GithubRepoPageProcesser()).thread(5).run();
+ Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run();
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java
index fa8dab6d..4ef830d5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java
@@ -12,7 +12,7 @@ import java.util.List;
*/
public class OschinaBlogPageProcesser implements PageProcessor {
- private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
+ private Site site = Site.me().setDomain("my.oschina.net");
@Override
public void process(Page page) {
@@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
- Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
+ Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
}
}
diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
index e45f9487..456b3cc5 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java
@@ -7,6 +7,7 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -88,7 +89,7 @@ public class UrlUtils {
return stringBuilder.toString();
}
- public static List convertToRequests(List urls) {
+ public static List convertToRequests(Collection urls) {
List requestList = new ArrayList(urls.size());
for (String url : urls) {
requestList.add(new Request(url));
@@ -96,7 +97,7 @@ public class UrlUtils {
return requestList;
}
- public static List convertToUrls(List requests) {
+ public static List convertToUrls(Collection requests) {
List urlList = new ArrayList(requests.size());
for (Request request : requests) {
urlList.add(request.getUrl());
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
index 9e630552..edd167de 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java
@@ -11,7 +11,7 @@ import java.util.ArrayList;
import java.util.List;
/**
- * @since 0.3.3
+ * @since 0.4.0
* NO implement yet!!!!!!!!!!!!
* @author code4crafter@gmail.com
*/