more abstraction in scheduler
parent
b0fb1c3e10
commit
1104122979
@ -0,0 +1,45 @@
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Remove duplicate urls and only push urls which are not duplicate.<br></br>
|
||||
*
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public abstract class DuplicatedRemoveScheduler implements Scheduler {
|
||||
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
logger.trace("get a candidate url {}", request.getUrl());
|
||||
if (isDuplicate(request, task) || shouldReserved(request)) {
|
||||
logger.debug("push to queue {}", request.getUrl());
|
||||
pushWhenNoDuplicate(request, task);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset duplicate check.
|
||||
*/
|
||||
public abstract void resetDuplicateCheck(Task task);
|
||||
|
||||
/**
|
||||
* @param request
|
||||
* @return
|
||||
*/
|
||||
protected abstract boolean isDuplicate(Request request, Task task);
|
||||
|
||||
protected boolean shouldReserved(Request request) {
|
||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||
}
|
||||
|
||||
protected void pushWhenNoDuplicate(Request request, Task task) {
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* Base Scheduler with duplicated urls removed by hash set.<br></br>
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public abstract class LocalDuplicatedRemoveScheduler extends DuplicatedRemoveScheduler implements MonitorableScheduler {
|
||||
|
||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
urls.clear();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isDuplicate(Request request, Task task) {
|
||||
return urls.add(request.getUrl());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return urls.size();
|
||||
}
|
||||
}
|
@ -1,47 +0,0 @@
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* Base Scheduler with duplicated urls removed locally.
|
||||
*
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.5.0
|
||||
*/
|
||||
public abstract class LocalDuplicatedRemovedScheduler implements MonitorableScheduler {
|
||||
|
||||
protected Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private Set<String> urls = Sets.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||
|
||||
@Override
|
||||
public void push(Request request, Task task) {
|
||||
logger.trace("get a candidate url {}", request.getUrl());
|
||||
if (isDuplicate(request) || shouldReserved(request)) {
|
||||
logger.debug("push to queue {}", request.getUrl());
|
||||
pushWhenNoDuplicate(request, task);
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean isDuplicate(Request request) {
|
||||
return urls.add(request.getUrl());
|
||||
}
|
||||
|
||||
protected boolean shouldReserved(Request request) {
|
||||
return request.getExtra(Request.CYCLE_TRIED_TIMES) != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return urls.size();
|
||||
}
|
||||
|
||||
protected abstract void pushWhenNoDuplicate(Request request, Task task);
|
||||
}
|
Loading…
Reference in New Issue