Merge branch 'stable' of github.com:code4craft/webmagic
commit
feb604da87
@ -0,0 +1,70 @@
|
||||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import com.google.common.hash.BloomFilter;
|
||||
import com.google.common.hash.Funnels;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/**
|
||||
* BloomFilterDuplicateRemover for huge number of urls.
|
||||
*
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class BloomFilterDuplicateRemover implements DuplicateRemover {
|
||||
|
||||
private int expectedInsertions;
|
||||
|
||||
private double fpp;
|
||||
|
||||
private AtomicInteger counter;
|
||||
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions) {
|
||||
this(expectedInsertions, 0.01);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param expectedInsertions the number of expected insertions to the constructed
|
||||
* @param fpp the desired false positive probability (must be positive and less than 1.0)
|
||||
*/
|
||||
public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) {
|
||||
this.expectedInsertions = expectedInsertions;
|
||||
this.fpp = fpp;
|
||||
this.bloomFilter = rebuildBloomFilter();
|
||||
}
|
||||
|
||||
protected BloomFilter<CharSequence> rebuildBloomFilter() {
|
||||
counter = new AtomicInteger(0);
|
||||
return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp);
|
||||
}
|
||||
|
||||
private final BloomFilter<CharSequence> bloomFilter;
|
||||
|
||||
@Override
|
||||
public boolean isDuplicate(Request request, Task task) {
|
||||
boolean isDuplicate = bloomFilter.mightContain(getUrl(request));
|
||||
if (!isDuplicate) {
|
||||
bloomFilter.put(getUrl(request));
|
||||
counter.incrementAndGet();
|
||||
}
|
||||
return isDuplicate;
|
||||
}
|
||||
|
||||
protected String getUrl(Request request) {
|
||||
return request.getUrl();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void resetDuplicateCheck(Task task) {
|
||||
rebuildBloomFilter();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getTotalRequestsCount(Task task) {
|
||||
return counter.get();
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package us.codecraft.webmagic.scheduler.component;
|
||||
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
/**
|
||||
* Remove duplicate requests.
|
||||
* @author code4crafer@gmail.com
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public interface DuplicateRemover {
|
||||
/**
|
||||
*
|
||||
* Check whether the request is duplicate.
|
||||
*
|
||||
* @param request
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public boolean isDuplicate(Request request, Task task);
|
||||
|
||||
/**
|
||||
* Reset duplicate check.
|
||||
* @param task
|
||||
*/
|
||||
public void resetDuplicateCheck(Task task);
|
||||
|
||||
/**
|
||||
* Get TotalRequestsCount for monitor.
|
||||
* @param task
|
||||
* @return
|
||||
*/
|
||||
public int getTotalRequestsCount(Task task);
|
||||
|
||||
}
|
@ -0,0 +1,5 @@
|
||||
<html>
|
||||
<body>
|
||||
Components of the scheduler, such as duplicate removers.
|
||||
</body>
|
||||
</html>
|
@ -0,0 +1,80 @@
|
||||
package us.codecraft.webmagic.scheduler;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Request;
|
||||
import us.codecraft.webmagic.scheduler.component.BloomFilterDuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
|
||||
import us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author code4crafer@gmail.com
|
||||
*/
|
||||
public class BloomFilterDuplicateRemoverTest {
|
||||
|
||||
@Test
|
||||
public void testRemove() throws Exception {
|
||||
BloomFilterDuplicateRemover bloomFilterDuplicateRemover = new BloomFilterDuplicateRemover(10);
|
||||
boolean isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("a"), null);
|
||||
assertThat(isDuplicate).isTrue();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate).isFalse();
|
||||
isDuplicate = bloomFilterDuplicateRemover.isDuplicate(new Request("b"), null);
|
||||
assertThat(isDuplicate).isTrue();
|
||||
|
||||
}
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testMemory() throws Exception {
|
||||
int times = 5000000;
|
||||
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times,0.005);
|
||||
long freeMemory = Runtime.getRuntime().freeMemory();
|
||||
long time = System.currentTimeMillis();
|
||||
for (int i = 0; i < times; i++) {
|
||||
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
}
|
||||
System.out.println("Time used by bloomfilter:" + (System.currentTimeMillis() - time));
|
||||
System.out.println("Memory used by bloomfilter:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||
|
||||
duplicateRemover = new HashSetDuplicateRemover();
|
||||
System.gc();
|
||||
freeMemory = Runtime.getRuntime().freeMemory();
|
||||
time = System.currentTimeMillis();
|
||||
for (int i = 0; i < times; i++) {
|
||||
duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
}
|
||||
System.out.println("Time used by hashset:" + (System.currentTimeMillis() - time));
|
||||
System.out.println("Memory used by hashset:" + (freeMemory - Runtime.getRuntime().freeMemory()));
|
||||
}
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testMissHit() throws Exception {
|
||||
int times = 5000000;
|
||||
DuplicateRemover duplicateRemover = new BloomFilterDuplicateRemover(times, 0.01);
|
||||
int right = 0;
|
||||
int wrong = 0;
|
||||
int missCheck = 0;
|
||||
for (int i = 0; i < times; i++) {
|
||||
boolean duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
if (duplicate) {
|
||||
wrong++;
|
||||
} else {
|
||||
right++;
|
||||
}
|
||||
duplicate = duplicateRemover.isDuplicate(new Request(String.valueOf(i)), null);
|
||||
if (!duplicate) {
|
||||
missCheck++;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Right count: " + right + " Wrong count: " + wrong + " Miss check: " + missCheck);
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package us.codecraft.webmagic.samples;
|
||||
|
||||
/**
 * Plain data holder describing a GitHub repository: its name, its author and the text of
 * its readme. All properties are {@code null} until set.
 *
 * @author code4crafer@gmail.com
 */
public class GithubRepo {

    private String name;
    private String author;
    private String readme;

    /** @return the repository name, or {@code null} if it has not been set */
    public String getName() {
        return this.name;
    }

    /** @return the repository author, or {@code null} if it has not been set */
    public String getAuthor() {
        return this.author;
    }

    /** @return the readme text, or {@code null} if it has not been set */
    public String getReadme() {
        return this.readme;
    }

    /** @param name the repository name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @param author the repository author to store */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @param readme the readme text to store */
    public void setReadme(String readme) {
        this.readme = readme;
    }
}
|
@ -0,0 +1,40 @@
|
||||
package us.codecraft.webmagic.samples;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.5.1
|
||||
*/
|
||||
public class GithubRepoPageProcessor implements PageProcessor {
|
||||
|
||||
private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
|
||||
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
|
||||
page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
|
||||
GithubRepo githubRepo = new GithubRepo();
|
||||
githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
|
||||
githubRepo.setName(page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
|
||||
githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
|
||||
if (githubRepo.getName() == null) {
|
||||
//skip this page
|
||||
page.setSkip(true);
|
||||
} else {
|
||||
page.putField("repo", githubRepo);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return site;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
|
||||
}
|
||||
}
|
@ -0,0 +1,7 @@
|
||||
package us.codecraft.webmagic.samples.pipeline;
|
||||
|
||||
/**
 * Placeholder class; it currently declares no fields or methods.
 *
 * @author code4crafer@gmail.com
 */
public class ReplacePipeline {

}
|
Loading…
Reference in New Issue