add direct download

pull/88/head^2
yihua.huang 11 years ago
parent 86cfefb58c
commit 8f774afc84

@ -68,4 +68,13 @@ public class ResultItems {
this.skip = skip;
return this;
}
/**
 * Builds a debug representation of the form
 * {@code ResultItems{fields=..., request=..., skip=...}}.
 *
 * @return text listing the current fields, request and skip values
 */
@Override
public String toString() {
    String body = "fields=" + fields + ", request=" + request + ", skip=" + skip;
    return "ResultItems{" + body + '}';
}
}

@ -43,6 +43,8 @@ public class Site {
private HttpHost httpProxy;
private boolean useGzip = true;
public static interface HeaderConst {
public static final String REFERER = "Referer";
@ -199,7 +201,10 @@ public class Site {
/**
* Add a url to start url.<br>
* Because urls are more a property of the Spider than of the Site, this has moved to {@link Spider#addUrl(String...)}.
*
* @deprecated
* @see Spider#addUrl(String...)
* @param startUrl
* @return this
*/
@ -209,7 +214,10 @@ public class Site {
/**
* Add a url to start url.<br>
* Because urls are more a property of the Spider than of the Site, this has moved to {@link Spider#addRequest(Request...)}.
*
* @deprecated
* @see Spider#addRequest(Request...)
* @param startUrl
* @return this
*/
@ -312,6 +320,22 @@ public class Site {
return this;
}
/**
 * Tells whether gzip is enabled for this site.
 *
 * @return the current value of the useGzip flag
 */
public boolean isUseGzip() {
    return this.useGzip;
}
/**
 * Enables or disables gzip for this site. <br>
 * The flag is true by default; pass false to turn gzip off.
 *
 * @param useGzip true to use gzip, false to disable it
 * @return this
 */
public Site setUseGzip(boolean useGzip) {
    this.useGzip = useGzip;
    return this;
}
public Task toTask() {
return new Task() {
@Override

@ -1,9 +1,11 @@
package us.codecraft.webmagic;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
@ -16,7 +18,9 @@ import us.codecraft.webmagic.utils.UrlUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
@ -85,6 +89,10 @@ public class Spider implements Runnable, Task {
protected final static int STAT_STOPPED = 2;
protected boolean spawnUrl = true;
protected boolean destroyWhenExit = true;
private ReentrantLock newUrlLock = new ReentrantLock();
private Condition newUrlCondition = newUrlLock.newCondition();
@ -244,7 +252,9 @@ public class Spider implements Runnable, Task {
pipelines.add(new ConsolePipeline());
}
downloader.setThread(threadNum);
executorService = ThreadUtils.newFixedThreadPool(threadNum);
if (executorService == null || executorService.isShutdown()) {
executorService = ThreadUtils.newFixedThreadPool(threadNum);
}
if (startRequests != null) {
for (Request request : startRequests) {
scheduler.push(request, this);
@ -285,10 +295,11 @@ public class Spider implements Runnable, Task {
});
}
}
executorService.shutdown();
stat.set(STAT_STOPPED);
// release some resources
destroy();
if (destroyWhenExit) {
close();
}
}
private void checkRunningStat() {
@ -303,12 +314,13 @@ public class Spider implements Runnable, Task {
}
}
protected void destroy() {
public void close() {
destroyEach(downloader);
destroyEach(pageProcessor);
for (Pipeline pipeline : pipelines) {
destroyEach(pipeline);
}
executorService.shutdown();
}
private void destroyEach(Object object) {
@ -366,7 +378,7 @@ public class Spider implements Runnable, Task {
}
protected void extractAndAddRequests(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
addRequest(request);
}
@ -374,8 +386,10 @@ public class Spider implements Runnable, Task {
}
/**
 * Hands a request to the scheduler. <br>
 * When the site has no domain yet, the domain is first derived from the url of the
 * request, provided both the request and its url are non-null.
 *
 * @param request the request to push to the scheduler
 */
private void addRequest(Request request) {
    boolean domainUnset = site.getDomain() == null;
    if (domainUnset && request != null) {
        String url = request.getUrl();
        if (url != null) {
            site.setDomain(UrlUtils.getDomain(url));
        }
    }
    scheduler.push(request, this);
}
protected void checkIfRunning() {
@ -391,7 +405,7 @@ public class Spider implements Runnable, Task {
}
/**
* Add urls to crawl.<br/>
* Add urls to crawl. <br/>
*
* @param urls
* @return
@ -404,6 +418,34 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Download urls synchronously and return the collected results. <br>
 * While this call is running, extracted urls are not followed (spawnUrl is off) and the
 * spider is not closed on exit (destroyWhenExit is off), so the same spider can be reused;
 * call {@link #close()} when it is no longer needed. <br>
 * Both flags are put back to the values they had before the call, and the temporary
 * collecting pipeline is removed again, even when the crawl fails with an exception.
 *
 * @param urls the urls to download
 * @return the results collected for the given urls
 */
public List<ResultItems> getAll(Collection<String> urls) {
    // Remember the caller's configuration so it is restored instead of being forced to true.
    final boolean previousDestroyWhenExit = destroyWhenExit;
    final boolean previousSpawnUrl = spawnUrl;
    final CollectorPipeline collectorPipeline = new CollectorPipeline();
    destroyWhenExit = false;
    spawnUrl = false;
    try {
        startRequests = UrlUtils.convertToRequests(urls);
        pipelines.add(collectorPipeline);
        run();
    } finally {
        // Drop the one-shot collector so repeated calls do not pile up pipelines
        // that keep receiving (and retaining) results of later crawls.
        pipelines.remove(collectorPipeline);
        spawnUrl = previousSpawnUrl;
        destroyWhenExit = previousDestroyWhenExit;
    }
    return collectorPipeline.getCollector();
}
/**
 * Download a single url through {@link #getAll(Collection)} and return its result.
 *
 * @param url the url to download
 * @return the first collected result, or null when nothing was collected
 */
public ResultItems get(String url) {
    List<ResultItems> collected = getAll(Lists.newArrayList(url));
    if (collected == null || collected.isEmpty()) {
        return null;
    }
    return collected.get(0);
}
/**
* Add urls with information to crawl.<br/>
*
@ -492,6 +534,24 @@ public class Spider implements Runnable, Task {
return this;
}
/**
 * Tells whether urls extracted from pages are added for download.
 *
 * @return the current value of the spawnUrl flag
 */
public boolean isSpawnUrl() {
    return this.spawnUrl;
}
/**
 * Sets whether extracted urls are added for download.<br>
 * When true, extracted urls are downloaded as well; when false, only the seed urls are downloaded. <br>
 * DO NOT set it unless you know what it means!
 *
 * @param spawnUrl true to follow extracted urls, false to download seed urls only
 * @return this
 * @since 0.4.0
 */
public Spider setSpawnUrl(boolean spawnUrl) {
    this.spawnUrl = spawnUrl;
    return this;
}
@Override
public String getUUID() {
if (uuid != null) {
@ -500,7 +560,8 @@ public class Spider implements Runnable, Task {
if (site != null) {
return site.getDomain();
}
return null;
uuid = UUID.randomUUID().toString();
return uuid;
}
@Override

@ -1,8 +1,9 @@
package us.codecraft.webmagic.downloader;
import org.apache.http.*;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
@ -19,7 +20,7 @@ import java.util.Map;
/**
* @author code4crafter@gmail.com <br>
* @since 0.3.3
* @since 0.4.0
*/
public class HttpClientGenerator {
@ -46,42 +47,48 @@ public class HttpClientGenerator {
} else {
httpClientBuilder.setUserAgent("");
}
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
if (site == null || site.isUseGzip()) {
httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
}).addInterceptorFirst(new HttpResponseInterceptor() {
public void process(
final HttpResponse response,
final HttpContext context) throws HttpException, IOException {
HttpEntity entity = response.getEntity();
if (entity != null) {
Header ceheader = entity.getContentEncoding();
if (ceheader != null) {
HeaderElement[] codecs = ceheader.getElements();
for (int i = 0; i < codecs.length; i++) {
if (codecs[i].getName().equalsIgnoreCase("gzip")) {
response.setEntity(
new GzipDecompressingEntity(response.getEntity()));
return;
}
}
public void process(
final HttpRequest request,
final HttpContext context) throws HttpException, IOException {
if (!request.containsHeader("Accept-Encoding")) {
request.addHeader("Accept-Encoding", "gzip");
}
}
}
});
if (site!=null){
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(),true));
}
});
}
// httpClientBuilder.disableContentCompression().addInterceptorFirst(new HttpResponseInterceptor() {
//
// public void process(
// final HttpResponse response,
// final HttpContext context) throws HttpException, IOException {
// if (response.getStatusLine().getStatusCode() != 200) {
// return;
// }
// HttpEntity entity = response.getEntity();
// if (entity != null) {
// Header ceheader = entity.getContentEncoding();
// if (ceheader != null) {
// HeaderElement[] codecs = ceheader.getElements();
// for (int i = 0; i < codecs.length; i++) {
// if (codecs[i].getName().equalsIgnoreCase("gzip")) {
// response.setEntity(
// new GzipDecompressingEntity(response.getEntity()));
// return;
// }
// }
// }
// }
// }
//
// });
if (site != null) {
httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
}
generateCookie(httpClientBuilder,site);
generateCookie(httpClientBuilder, site);
return httpClientBuilder.build();
}

@ -0,0 +1,25 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Pipeline that keeps every {@link ResultItems} it receives in memory, so the results can
 * be read back with {@link #getCollector()} after crawling. <br>
 * Both methods are synchronized: the backing ArrayList is not safe for concurrent adds,
 * and a spider configured with more than one thread is expected to call
 * {@link #process(ResultItems, Task)} from several threads.
 *
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class CollectorPipeline implements Pipeline {

    private final List<ResultItems> collector = new ArrayList<ResultItems>();

    /**
     * Stores the given result.
     *
     * @param resultItems the result to keep
     * @param task        the task the result belongs to (not used here)
     */
    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        collector.add(resultItems);
    }

    /**
     * Returns the list of results collected so far. <br>
     * This is the live backing list, not a copy; read it once crawling has finished.
     *
     * @return the collected results
     */
    public synchronized List<ResultItems> getCollector() {
        return collector;
    }
}

@ -0,0 +1,48 @@
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.ArrayList;
import java.util.List;
/**
 * Example processor that extracts the name and description of Baidu Baike entries and
 * shows how to download a fixed list of urls with {@link Spider#getAll(java.util.Collection)}.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.4.0
 */
public class BaiduBaikePageProcesser implements PageProcessor {

    private final Site site = Site.me()
            .setCharset("utf-8").setRetryTimes(3).setSleepTime(1000).setUseGzip(true);

    /**
     * Puts the entry title into the "name" field and the text of the first content
     * section into the "description" field.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        page.putField("name", page.getHtml().$("h1.title div.lemmaTitleH1","text").toString());
        page.putField("description", page.getHtml().xpath("//div[@id='lemmaContent-0']//div[@class='para']/allText()"));
    }

    /**
     * @return the site settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Downloads a handful of entries and prints the extracted fields.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new BaiduBaikePageProcesser()).thread(2);
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        // NOTE(review): "地热发电" appears twice; kept so the requested urls are unchanged.
        String[] words = {"水力发电", "风力发电", "太阳能", "地热发电", "众数", "地热发电"};
        List<String> list = new ArrayList<String>(words.length);
        for (String word : words) {
            list.add(String.format(urlTemplate, word));
        }
        try {
            List<ResultItems> resultItemses = spider.getAll(list);
            for (ResultItems resultItemse : resultItemses) {
                System.out.println(resultItemse.getAll());
            }
        } finally {
            // getAll leaves the spider open for reuse, so release it even when the crawl fails.
            spider.close();
        }
    }
}

@ -11,7 +11,7 @@ import us.codecraft.webmagic.processor.PageProcessor;
*/
public class GithubRepoPageProcesser implements PageProcessor {
private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100);
private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
@Override
public void process(Page page) {
@ -31,6 +31,6 @@ public class GithubRepoPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new GithubRepoPageProcesser()).thread(5).run();
Spider.create(new GithubRepoPageProcesser()).addUrl("https://github.com/code4craft").thread(5).run();
}
}

@ -12,7 +12,7 @@ import java.util.List;
*/
public class OschinaBlogPageProcesser implements PageProcessor {
private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog");
private Site site = Site.me().setDomain("my.oschina.net");
@Override
public void process(Page page) {
@ -34,6 +34,6 @@ public class OschinaBlogPageProcesser implements PageProcessor {
}
public static void main(String[] args) {
Spider.create(new OschinaBlogPageProcesser()).thread(2).run();
Spider.create(new OschinaBlogPageProcesser()).addUrl("http://my.oschina.net/flashsword/blog").thread(2).run();
}
}

@ -7,6 +7,7 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -88,7 +89,7 @@ public class UrlUtils {
return stringBuilder.toString();
}
public static List<Request> convertToRequests(List<String> urls) {
public static List<Request> convertToRequests(Collection<String> urls) {
List<Request> requestList = new ArrayList<Request>(urls.size());
for (String url : urls) {
requestList.add(new Request(url));
@ -96,7 +97,7 @@ public class UrlUtils {
return requestList;
}
public static List<String> convertToUrls(List<Request> requests) {
public static List<String> convertToUrls(Collection<Request> requests) {
List<String> urlList = new ArrayList<String>(requests.size());
for (Request request : requests) {
urlList.add(request.getUrl());

@ -11,7 +11,7 @@ import java.util.ArrayList;
import java.util.List;
/**
* @since 0.3.3
* @since 0.4.0
* Not implemented yet!
* @author code4crafter@gmail.com
*/

Loading…
Cancel
Save