add bloom filter for scheduler #118
parent
64293cba20
commit
d1140b9e29
@ -0,0 +1,61 @@
|
||||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import com.google.common.hash.BloomFilter;
|
||||
import com.google.common.hash.Funnels;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* BloomFilterDuplicateRemover for huge number of urls.
|
||||
*
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||
|
||||
private int expectedInsertions;
|
||||
|
||||
private double fpp;
|
||||
|
||||
private AtomicInteger counter;
|
||||
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions) {
|
||||
this(expectedInsertions, 0.03);
|
||||
}
|
||||
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
|
||||
this.expectedInsertions = expectedInsertions;
|
||||
this.fpp = fpp;
|
||||
this.bloomFilter = rebuildBloomFilter();
|
||||
}
|
||||
|
||||
protected BloomFilter<CharSequence> rebuildBloomFilter() {
|
||||
counter = new AtomicInteger(0);
|
||||
return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
|
||||
}
|
||||
|
||||
private final BloomFilter<CharSequence> bloomFilter;
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
boolean isDuplicate = bloomFilter.mightContain(request.getUrl());
|
||||
if (!isDuplicate) {
|
||||
bloomFilter.apply(request.getUrl());
|
||||
counter.incrementAndGet();
|
||||
}
|
||||
return isDuplicate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
rebuildBloomFilter();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return counter.get();
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Remove duplicate requests.
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public interface DuplicateRemover {
|
||||
/**
|
||||
*
|
||||
* Check whether the request is duplicate.
|
||||
*
|
||||
* @param request
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public boolean isDuplicate(Request request, Task task);
|
||||
|
||||
/**
|
||||
* Reset duplicate check.
|
||||
* @param task
|
||||
*/
|
||||
public void resetDuplicateCheck(Task task);
|
||||
|
||||
/**
|
||||
* Get TotalRequestsCount for monitor.
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public int getTotalRequestsCount(Task task);
|
||||
|
||||
}
|
@ -0,0 +1,5 @@
|
||||
<html>
|
||||
<body>
|
||||
Components used by schedulers, such as duplicate removers.
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,27 @@
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class BloomFilterDuplicateRemoverTest {
|
||||
|
||||
@Test
|
||||
public void testRemove() throws Exception {
|
||||
BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
|
||||
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate);
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate);
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue