update redisscheduler

pull/17/head
yihua.huang 12 years ago
parent 787b952932
commit 0f2c5b5723

@ -148,4 +148,15 @@ public class Page {
public ResultItems getResultItems() {
return resultItems;
}
/**
 * Builds a debug representation of this page listing its request, result items,
 * html, url and target requests, in that order.
 *
 * @return the textual form {@code Page{request=..., resultItems=..., html=..., url=..., targetRequests=...}}
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Page{");
    text.append("request=").append(request);
    text.append(", resultItems=").append(resultItems);
    text.append(", html=").append(html);
    text.append(", url=").append(url);
    text.append(", targetRequests=").append(targetRequests);
    text.append('}');
    return text.toString();
}
}

@ -113,4 +113,13 @@ public class Request implements Serializable {
public void setUrl(String url) {
this.url = url;
}
/**
 * Builds a debug representation of this request containing its url (single-quoted),
 * extras and priority.
 *
 * @return the textual form {@code Request{url='...', extras=..., priority=...}}
 */
@Override
public String toString() {
    StringBuilder text = new StringBuilder("Request{");
    text.append("url='").append(url).append('\'');
    text.append(", extras=").append(extras);
    text.append(", priority=").append(priority);
    text.append('}');
    return text.toString();
}
}

@ -40,33 +40,33 @@ import java.util.concurrent.atomic.AtomicInteger;
*/
public class Spider implements Runnable, Task {
private Downloader downloader;
protected Downloader downloader;
private List<Pipeline> pipelines = new ArrayList<Pipeline>();
protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
private PageProcessor pageProcessor;
protected PageProcessor pageProcessor;
private List<String> startUrls;
protected List<String> startUrls;
private Site site;
protected Site site;
private String uuid;
protected String uuid;
private Scheduler scheduler = new QueueScheduler();
protected Scheduler scheduler = new QueueScheduler();
private Logger logger = Logger.getLogger(getClass());
protected Logger logger = Logger.getLogger(getClass());
private ExecutorService executorService;
protected ExecutorService executorService;
private int threadNum = 1;
protected int threadNum = 1;
private AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
private final static int STAT_INIT = 0;
protected final static int STAT_INIT = 0;
private final static int STAT_RUNNING = 1;
protected final static int STAT_RUNNING = 1;
private final static int STAT_STOPPED = 2;
protected final static int STAT_STOPPED = 2;
/**
* 使Spider
@ -206,7 +206,7 @@ public class Spider implements Runnable, Task {
destroy();
}
private void destroy() {
protected void destroy() {
destroyEach(downloader);
destroyEach(pageProcessor);
for (Pipeline pipeline : pipelines) {
@ -233,7 +233,7 @@ public class Spider implements Runnable, Task {
}
}
private void processRequest(Request request) {
protected void processRequest(Request request) {
Page page = downloader.download(request, this);
if (page == null) {
sleep(site.getSleepTime());
@ -249,7 +249,7 @@ public class Spider implements Runnable, Task {
sleep(site.getSleepTime());
}
private void sleep(int time) {
protected void sleep(int time) {
try {
Thread.sleep(time);
} catch (InterruptedException e) {
@ -257,7 +257,7 @@ public class Spider implements Runnable, Task {
}
}
private void addRequest(Page page) {
protected void addRequest(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
scheduler.push(request, this);
@ -265,7 +265,7 @@ public class Spider implements Runnable, Task {
}
}
private void checkIfNotRunning() {
protected void checkIfNotRunning() {
if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) {
throw new IllegalStateException("Spider is already running!");
}

@ -66,13 +66,7 @@ public class HttpClientDownloader implements Downloader {
}
//
handleGzip(httpResponse);
String content = IOUtils.toString(httpResponse.getEntity().getContent(),
charset);
Page page = new Page();
page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
page.setUrl(new PlainText(request.getUrl()));
page.setRequest(request);
return page;
return handleResponse(request, charset, httpResponse,task);
} else {
logger.warn("code error " + statusCode + "\t" + request.getUrl());
}
@ -82,6 +76,16 @@ public class HttpClientDownloader implements Downloader {
return null;
}
/**
 * Converts a downloaded HTTP response into a {@link Page}.
 * <p>
 * The response entity is read fully as text using {@code charset}; the text is passed
 * through {@code UrlUtils.fixAllRelativeHrefs} together with the request url and stored
 * as the page html. The page url and originating request are set from {@code request}.
 *
 * @param request      the request that produced the response
 * @param charset      charset name used to decode the response body
 * @param httpResponse the response whose entity is read
 * @param task         the task the download belongs to; not used by this implementation
 * @return the populated page
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse,Task task) throws IOException {
    final String body = IOUtils.toString(httpResponse.getEntity().getContent(), charset);
    final String requestUrl = request.getUrl();

    Page result = new Page();
    Html html = new Html(UrlUtils.fixAllRelativeHrefs(body, requestUrl));
    result.setHtml(html);
    result.setUrl(new PlainText(requestUrl));
    result.setRequest(request);
    return result;
}
@Override
public void setThread(int thread) {
poolSize=thread;

@ -4,8 +4,8 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
@ -18,9 +18,7 @@ import java.util.Map;
* Date: 13-4-21
* Time: 6:28
*/
public class FilePipeline implements Pipeline {
private String path = "/data/webmagic/";
public class FilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
@ -28,7 +26,7 @@ public class FilePipeline implements Pipeline {
* Creates a FilePipeline that uses the default path "/data/webmagic/".
*/
public FilePipeline() {
// No path supplied: fall back to the default directory.
setPath("/data/webmagic/");
}
/**
@ -37,21 +35,14 @@ public class FilePipeline implements Pipeline {
* @param path
*/
public FilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
setPath(path);
}
@Override
public void process(ResultItems resultItems, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")));
printWriter.println("url:\t" + resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) {

@ -0,0 +1,51 @@
package us.codecraft.webmagic.utils;
import java.io.File;
/**
 * Base class for components that write files under a configurable base directory.
 * It stores the base path (always terminated by a separator once set through
 * {@link #setPath(String)}) and creates missing parent directories on demand.
 * <br>
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-11 <br>
 * Time: 4:21 <br>
 */
public class FilePersistentBase {

    /** Base directory; ends with a path separator after {@link #setPath(String)} has been called. */
    protected String path;

    /**
     * Separator used when composing paths. Defaults to "/" and is replaced by the
     * {@code file.separator} system property when that is available.
     * Left non-final so existing code that assigns it keeps compiling.
     */
    public static String PATH_SEPERATOR = "/";

    static {
        String property = System.getProperties().getProperty("file.separator");
        if (property != null) {
            PATH_SEPERATOR = property;
        }
    }

    /**
     * Sets the base directory, appending a trailing separator when it is missing.
     * A path that already ends with {@link #PATH_SEPERATOR} or with "/" is stored unchanged.
     *
     * @param path base directory; must not be null
     * @throws NullPointerException if {@code path} is null
     */
    public void setPath(String path) {
        // Normalize BEFORE storing: assigning first would discard the appended separator.
        if (!path.endsWith(PATH_SEPERATOR) && !path.endsWith("/")) {
            path += PATH_SEPERATOR;
        }
        this.path = path;
    }

    /**
     * Returns a {@link File} for the given name after making sure its parent directory exists.
     * The file itself is not created.
     *
     * @param fullName full path of the file
     * @return a File pointing at {@code fullName}
     */
    public File getFile(String fullName) {
        checkAndMakeParentDirecotry(fullName);
        return new File(fullName);
    }

    /**
     * Creates the parent directory of {@code fullName} (including missing ancestors)
     * when it does not exist yet. Does nothing when the name has no parent component.
     *
     * @param fullName full path of the file whose parent directory is required
     */
    public void checkAndMakeParentDirecotry(String fullName) {
        // Let java.io.File locate the parent so that every separator the platform
        // accepts is honoured, not only PATH_SEPERATOR.
        File parent = new File(fullName).getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
    }

    /**
     * @return the base directory as stored by {@link #setPath(String)}, or null if never set
     */
    public String getPath() {
        return path;
    }
}

@ -2,6 +2,7 @@ package us.codecraft.webmagic.model;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* ModelSpider<br>
@ -20,6 +21,10 @@ public class OOSpider extends Spider {
this.modelPageProcessor = modelPageProcessor;
}
/**
 * Creates an OOSpider from a plain {@link PageProcessor} by delegating to the
 * {@code Spider} constructor that takes a page processor.
 *
 * @param pageProcessor the processor passed to the superclass constructor
 */
public OOSpider(PageProcessor pageProcessor) {
super(pageProcessor);
}
/**
* <br>
* @param site

@ -7,8 +7,8 @@ import org.apache.log4j.Logger;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.PageModelPipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
@ -21,38 +21,29 @@ import java.io.PrintWriter;
* Date: 13-4-21
* Time: 6:28
*/
public class JsonFilePageModelPipeline implements PageModelPipeline {
private String path = "/data/webmagic/";
public class JsonFilePageModelPipeline extends FilePersistentBase implements PageModelPipeline {
private Logger logger = Logger.getLogger(getClass());
/**
* FilePipeline使"/data/webmagic/"
* Creates a JsonFilePageModelPipeline that uses the default path "/data/webmagic/".
*/
public JsonFilePageModelPipeline() {
// No path supplied: fall back to the default directory.
setPath("/data/webmagic/");
}
/**
* FilePipeline
* JsonFilePageModelPipeline
*
* @param path
*/
public JsonFilePageModelPipeline(String path) {
if (!path.endsWith("/") && !path.endsWith("\\")) {
path += "/";
}
this.path = path;
setPath(path);
}
@Override
public void process(Object o, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
try {
String filename;
if (o instanceof HasKey) {
@ -60,7 +51,7 @@ public class JsonFilePageModelPipeline implements PageModelPipeline {
} else {
filename = path + DigestUtils.md5Hex(ToStringBuilder.reflectionToString(o)) + ".json";
}
PrintWriter printWriter = new PrintWriter(new FileWriter(filename));
PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
printWriter.write(JSON.toJSONString(o));
printWriter.close();
} catch (IOException e) {

@ -5,6 +5,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.File;
import java.io.FileWriter;
@ -18,40 +19,31 @@ import java.io.PrintWriter;
* Date: 13-4-21
* Time: 6:28
*/
public class JsonFilePipeline implements Pipeline {
private String path = "/data/webmagic/";
public class JsonFilePipeline extends FilePersistentBase implements Pipeline {
private Logger logger = Logger.getLogger(getClass());
/**
* FilePipeline使"/data/webmagic/"
* Creates a JsonFilePipeline that uses the default path "/data/webmagic/".
*/
public JsonFilePipeline() {
    // Default directory, written with its trailing separator so that file names
    // appended directly to the path land inside the directory; this also matches
    // the default used by the other file pipelines.
    setPath("/data/webmagic/");
}
/**
* FilePipeline
* JsonFilePipeline
*
* @param path
*/
public JsonFilePipeline(String path) {
if (!path.endsWith("/")&&!path.endsWith("\\")){
path+="/";
}
this.path = path;
setPath(path);
}
@Override
public void process(ResultItems resultItems, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();
}
try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"));
PrintWriter printWriter = new PrintWriter(new FileWriter(new File(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
printWriter.write(JSON.toJSONString(resultItems.getAll()));
printWriter.close();
} catch (IOException e) {

@ -12,8 +12,8 @@ import us.codecraft.webmagic.Task;
* 使redisurl<br>
*
* @author code4crafter@gmail.com <br>
* Date: 13-7-25 <br>
* Time: 7:07 <br>
* Date: 13-7-25 <br>
* Time: 7:07 <br>
*/
public class RedisScheduler implements Scheduler {
@ -32,34 +32,42 @@ public class RedisScheduler implements Scheduler {
@Override
public synchronized void push(Request request, Task task) {
Jedis jedis = pool.getResource();
//使用SortedSet进行url去重
if (jedis.zrank(SET_PREFIX + task.getUUID(), request.getUrl()) == null) {
//使用List保存队列
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.zadd(SET_PREFIX + task.getUUID(), request.getPriority(), request.getUrl());
if (request.getExtras() != null) {
String key = ITEM_PREFIX + DigestUtils.shaHex(request.getUrl());
byte[] bytes = JSON.toJSONString(request).getBytes();
jedis.set(key.getBytes(), bytes);
try {
//使用Set进行url去重
if (!jedis.sismember(SET_PREFIX + task.getUUID(), request.getUrl())) {
//使用List保存队列
jedis.rpush(QUEUE_PREFIX + task.getUUID(), request.getUrl());
jedis.sadd(SET_PREFIX + task.getUUID(), request.getUrl());
if (request.getExtras() != null) {
String field = DigestUtils.shaHex(request.getUrl());
String value = JSON.toJSONString(request);
jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
}
}
} finally {
pool.returnResource(jedis);
}
pool.returnResource(jedis);
}
@Override
public synchronized Request poll(Task task) {
Jedis jedis = pool.getResource();
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
if (url == null) {
return null;
}
String key = ITEM_PREFIX + DigestUtils.shaHex(url);
byte[] bytes = jedis.get(key.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes),Request.class);
return o;
try {
String url = jedis.lpop(QUEUE_PREFIX + task.getUUID());
if (url == null) {
return null;
}
String key = ITEM_PREFIX + task.getUUID();
String field = DigestUtils.shaHex(url);
byte[] bytes = jedis.hget(key.getBytes(),field.getBytes());
if (bytes != null) {
Request o = JSON.parseObject(new String(bytes), Request.class);
return o;
}
Request request = new Request(url);
return request;
} finally {
pool.returnResource(jedis);
}
pool.returnResource(jedis);
return new Request(url);
}
}

Loading…
Cancel
Save