From 8f774afc84898673d58ef79bb2b0ca28fb9ccd80 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Wed, 6 Nov 2013 06:41:04 +0800 Subject: [PATCH] add direct download --- .../us/codecraft/webmagic/ResultItems.java | 9 +++ .../main/java/us/codecraft/webmagic/Site.java | 24 ++++++ .../java/us/codecraft/webmagic/Spider.java | 77 +++++++++++++++++-- .../downloader/HttpClientGenerator.java | 77 ++++++++++--------- .../webmagic/pipeline/CollectorPipeline.java | 25 ++++++ .../example/BaiduBaikePageProcesser.java | 48 ++++++++++++ .../example/GithubRepoPageProcesser.java | 4 +- .../example/OschinaBlogPageProcesser.java | 4 +- .../us/codecraft/webmagic/utils/UrlUtils.java | 5 +- .../webmagic/example/BaiduBaike.java | 2 +- 10 files changed, 225 insertions(+), 50 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java index e0552709..4791e77a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/ResultItems.java @@ -68,4 +68,13 @@ public class ResultItems { this.skip = skip; return this; } + + @Override + public String toString() { + return "ResultItems{" + + "fields=" + fields + + ", request=" + request + + ", skip=" + skip + + '}'; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 33e9b8f7..22015c36 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -43,6 +43,8 @@ public class Site { private HttpHost httpProxy; + private boolean useGzip = true; + public static interface HeaderConst { public 
static final String REFERER = "Referer"; @@ -199,7 +201,10 @@ public class Site { /** * Add a url to start url.
+ * Because urls are a property of the Spider rather than of the Site, use {@link Spider#addUrl(String...)} instead. * + * @deprecated + * @see Spider#addUrl(String...) * @param startUrl * @return this */ @@ -209,7 +214,10 @@ public class Site { /** * Add a url to start url.
+ * Because urls are a property of the Spider rather than of the Site, use {@link Spider#addRequest(Request...)} instead. * + * @deprecated + * @see Spider#addRequest(Request...) * @param startUrl * @return this */ @@ -312,6 +320,22 @@ public class Site { return this; } + public boolean isUseGzip() { + return useGzip; + } + + /** + * Whether use gzip.
+ * Default is true, you can set it to false to disable gzip. + * + * @param useGzip + * @return + */ + public Site setUseGzip(boolean useGzip) { + this.useGzip = useGzip; + return this; + } + public Task toTask() { return new Task() { @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 04ac8942..9a580bde 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -1,9 +1,11 @@ package us.codecraft.webmagic; +import com.google.common.collect.Lists; import org.apache.commons.collections.CollectionUtils; import org.apache.log4j.Logger; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.downloader.HttpClientDownloader; +import us.codecraft.webmagic.pipeline.CollectorPipeline; import us.codecraft.webmagic.pipeline.ConsolePipeline; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.processor.PageProcessor; @@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; +import java.util.UUID; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; @@ -85,6 +89,10 @@ public class Spider implements Runnable, Task { protected final static int STAT_STOPPED = 2; + protected boolean spawnUrl = true; + + protected boolean destroyWhenExit = true; + private ReentrantLock newUrlLock = new ReentrantLock(); private Condition newUrlCondition = newUrlLock.newCondition(); @@ -244,7 +252,9 @@ public class Spider implements Runnable, Task { pipelines.add(new ConsolePipeline()); } downloader.setThread(threadNum); - executorService = ThreadUtils.newFixedThreadPool(threadNum); + if (executorService == null || 
executorService.isShutdown()) { + executorService = ThreadUtils.newFixedThreadPool(threadNum); + } if (startRequests != null) { for (Request request : startRequests) { scheduler.push(request, this); @@ -285,10 +295,11 @@ public class Spider implements Runnable, Task { }); } } - executorService.shutdown(); stat.set(STAT_STOPPED); // release some resources - destroy(); + if (destroyWhenExit) { + close(); + } } private void checkRunningStat() { @@ -303,12 +314,13 @@ public class Spider implements Runnable, Task { } } - protected void destroy() { + public void close() { destroyEach(downloader); destroyEach(pageProcessor); for (Pipeline pipeline : pipelines) { destroyEach(pipeline); } + executorService.shutdown(); } private void destroyEach(Object object) { @@ -366,7 +378,7 @@ public class Spider implements Runnable, Task { } protected void extractAndAddRequests(Page page) { - if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { + if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { addRequest(request); } @@ -374,8 +386,10 @@ public class Spider implements Runnable, Task { } private void addRequest(Request request) { + if (site.getDomain() == null && request != null && request.getUrl() != null) { + site.setDomain(UrlUtils.getDomain(request.getUrl())); + } scheduler.push(request, this); - } protected void checkIfRunning() { @@ -391,7 +405,7 @@ public class Spider implements Runnable, Task { } /** - * Add urls to crawl.
+ * Add urls to crawl.
* * @param urls * @return @@ -404,6 +418,34 @@ public class Spider implements Runnable, Task { return this; } + /** + * Download urls synchronizing. + * + * @param urls + * @return + */ + public List getAll(Collection urls) { + destroyWhenExit = false; + spawnUrl = false; + startRequests = UrlUtils.convertToRequests(urls); + CollectorPipeline collectorPipeline = new CollectorPipeline(); + pipelines.add(collectorPipeline); + run(); + spawnUrl = true; + destroyWhenExit = true; + return collectorPipeline.getCollector(); + } + + public ResultItems get(String url) { + List urls = Lists.newArrayList(url); + List resultItemses = getAll(urls); + if (resultItemses != null && resultItemses.size() > 0) { + return resultItemses.get(0); + } else { + return null; + } + } + /** * Add urls with information to crawl.
* @@ -492,6 +534,24 @@ public class Spider implements Runnable, Task { return this; } + public boolean isSpawnUrl() { + return spawnUrl; + } + + /** + * Whether add urls extracted to download.
+ * When it is true, urls extracted from a page are scheduled for download; when it is false, only the seed urls are downloaded.
+ * DO NOT set it unless you know what it means! + * + * @param spawnUrl + * @return + * @since 0.4.0 + */ + public Spider setSpawnUrl(boolean spawnUrl) { + this.spawnUrl = spawnUrl; + return this; + } + @Override public String getUUID() { if (uuid != null) { @@ -500,7 +560,8 @@ public class Spider implements Runnable, Task { if (site != null) { return site.getDomain(); } - return null; + uuid = UUID.randomUUID().toString(); + return uuid; } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index dbc38286..a3319a0c 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,8 +1,9 @@ package us.codecraft.webmagic.downloader; -import org.apache.http.*; +import org.apache.http.HttpException; +import org.apache.http.HttpRequest; +import org.apache.http.HttpRequestInterceptor; import org.apache.http.client.CookieStore; -import org.apache.http.client.entity.GzipDecompressingEntity; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; @@ -19,7 +20,7 @@ import java.util.Map; /** * @author code4crafter@gmail.com
- * @since 0.3.3 + * @since 0.4.0 */ public class HttpClientGenerator { @@ -46,42 +47,48 @@ public class HttpClientGenerator { } else { httpClientBuilder.setUserAgent(""); } - httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { + if (site == null || site.isUseGzip()) { + httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() { - public void process( - final HttpRequest request, - final HttpContext context) throws HttpException, IOException { - if (!request.containsHeader("Accept-Encoding")) { - request.addHeader("Accept-Encoding", "gzip"); - } - - } - }).addInterceptorFirst(new HttpResponseInterceptor() { - - public void process( - final HttpResponse response, - final HttpContext context) throws HttpException, IOException { - HttpEntity entity = response.getEntity(); - if (entity != null) { - Header ceheader = entity.getContentEncoding(); - if (ceheader != null) { - HeaderElement[] codecs = ceheader.getElements(); - for (int i = 0; i < codecs.length; i++) { - if (codecs[i].getName().equalsIgnoreCase("gzip")) { - response.setEntity( - new GzipDecompressingEntity(response.getEntity())); - return; - } - } + public void process( + final HttpRequest request, + final HttpContext context) throws HttpException, IOException { + if (!request.containsHeader("Accept-Encoding")) { + request.addHeader("Accept-Encoding", "gzip"); } - } - } - }); - if (site!=null){ - httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true)); + } + }); + } +// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() { +// +// public void process( +// final HttpResponse response, +// final HttpContext context) throws HttpException, IOException { +// if (response.getStatusLine().getStatusCode() != 200) { +// return; +// } +// HttpEntity entity = response.getEntity(); +// if (entity != null) { +// Header ceheader = entity.getContentEncoding(); +// if (ceheader != null) { +// HeaderElement[] 
codecs = ceheader.getElements(); +// for (int i = 0; i < codecs.length; i++) { +// if (codecs[i].getName().equalsIgnoreCase("gzip")) { +// response.setEntity( +// new GzipDecompressingEntity(response.getEntity())); +// return; +// } +// } +// } +// } +// } +// +// }); + if (site != null) { + httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true)); } - generateCookie(httpClientBuilder,site); + generateCookie(httpClientBuilder, site); return httpClientBuilder.build(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java new file mode 100644 index 00000000..012c4c56 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java @@ -0,0 +1,25 @@ +package us.codecraft.webmagic.pipeline; + +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.4.0 + */ +public class CollectorPipeline implements Pipeline{ + + private List collector = new ArrayList(); + + @Override + public void process(ResultItems resultItems, Task task) { + collector.add(resultItems); + } + + public List getCollector() { + return collector; + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java new file mode 100644 index 00000000..b3e7d78f --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/BaiduBaikePageProcesser.java @@ -0,0 +1,48 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import 
us.codecraft.webmagic.processor.PageProcessor; + +import java.util.ArrayList; +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ * @since 0.4.0 + */ +public class BaiduBaikePageProcesser implements PageProcessor { + + private Site site = Site.me()//.setHttpProxy(new HttpHost("127.0.0.1",8888)) + .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true); + + @Override + public void process(Page page) { + page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString()); + page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2); + List list = new ArrayList(); + String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; + list.add(String.format(urlTemplate,"水力发电")); + list.add(String.format(urlTemplate,"风力发电")); + list.add(String.format(urlTemplate,"太阳能")); + list.add(String.format(urlTemplate,"地热发电")); + list.add(String.format(urlTemplate,"众数")); + list.add(String.format(urlTemplate,"地热发电")); + List resultItemses = spider.getAll(list); + for (ResultItems resultItemse : resultItemses) { + System.out.println(resultItemse.getAll()); + } + spider.close(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java index 0e7e3b92..47f904f9 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java @@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor; */ public class GithubRepoPageProcesser implements PageProcessor { - private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); + private Site site = 
Site.me().setRetryTimes(3).setSleepTime(100); @Override public void process(Page page) { @@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new GithubRepoPageProcesser()).thread(5).run(); + Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java index fa8dab6d..4ef830d5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -12,7 +12,7 @@ import java.util.List; */ public class OschinaBlogPageProcesser implements PageProcessor { - private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + private Site site = Site.me().setDomain("my.oschina.net"); @Override public void process(Page page) { @@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor { } public static void main(String[] args) { - Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index e45f9487..456b3cc5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -7,6 +7,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; +import java.util.Collection; import 
java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -88,7 +89,7 @@ public class UrlUtils { return stringBuilder.toString(); } - public static List convertToRequests(List urls) { + public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { requestList.add(new Request(url)); @@ -96,7 +97,7 @@ public class UrlUtils { return requestList; } - public static List convertToUrls(List requests) { + public static List convertToUrls(Collection requests) { List urlList = new ArrayList(requests.size()); for (Request request : requests) { urlList.add(request.getUrl()); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java index 9e630552..edd167de 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/BaiduBaike.java @@ -11,7 +11,7 @@ import java.util.ArrayList; import java.util.List; /** - * @since 0.3.3 + * @since 0.4.0 * NO implement yet!!!!!!!!!!!! * @author code4crafter@gmail.com */