update jar

pull/358/head
yihua.huang 12 years ago
commit 372cc0ad06

@ -22,12 +22,12 @@ Add dependencies to your pom.xml:
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.3.0</version> <version>0.3.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.3.0</version> <version>0.3.1</version>
</dependency> </dependency>
## Get Started: ## Get Started:

@ -1,6 +1,6 @@
#!/bin/sh #!/bin/sh
mvn clean package mvn clean package
cp ./webmagic-samples/target/webmagic-0.3.1.jar ./bin/ cp ./webmagic-samples/target/webmagic-0.3.2.jar ./bin/
rsync -avz --delete ./webmagic-samples/target/lib/ ./bin/lib/ rsync -avz --delete ./webmagic-samples/target/lib/ ./bin/lib/
git add . git add .
git add -u git add -u

@ -6,7 +6,7 @@
<version>7</version> <version>7</version>
</parent> </parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.3.1</version> <version>0.3.2</version>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging> <packaging>pom</packaging>
<properties> <properties>

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.3.1</version> <version>0.3.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -96,11 +96,6 @@ public class Site {
* @return get domain * @return get domain
*/ */
public String getDomain() { public String getDomain() {
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
}
}
return domain; return domain;
} }
@ -176,6 +171,11 @@ public class Site {
*/ */
public Site addStartUrl(String startUrl) { public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl); this.startUrls.add(startUrl);
if (domain == null) {
if (startUrls.size() > 0) {
domain = UrlUtils.getDomain(startUrls.get(0));
}
}
return this; return this;
} }

@ -21,22 +21,27 @@ import java.util.concurrent.atomic.AtomicInteger;
/** /**
* Entrance of a crawler.<br> * Entrance of a crawler.<br>
* A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.<br> * A spider contains four modules: Downloader, Scheduler, PageProcessor and
* Every module is a field of Spider. <br> * Pipeline.<br>
* The modules are defined in interface. <br> * Every module is a field of Spider. <br>
* You can customize a spider with various implementations of them. <br> * The modules are defined in interface. <br>
* Examples: <br> * You can customize a spider with various implementations of them. <br>
* Examples: <br>
* <br> * <br>
* A simple crawler: <br> * A simple crawler: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();<br> * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")).run();<br>
* <br> * <br>
* Store results to files by FilePipeline: <br> * Store results to files by FilePipeline: <br>
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br> * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br> * "http://my.oschina.net/*blog/*")) <br>
* .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); <br>
* <br> * <br>
* Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown. <br> * Use FileCacheQueueScheduler to store urls and cursor in files, so that a
* Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")) <br> * Spider can resume the status when shutdown. <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br> * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
* "http://my.oschina.net/*blog/*")) <br>
* .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); <br>
* *
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @see Downloader * @see Downloader
@ -47,372 +52,381 @@ import java.util.concurrent.atomic.AtomicInteger;
*/ */
public class Spider implements Runnable, Task { public class Spider implements Runnable, Task {
protected Downloader downloader; protected Downloader downloader;
protected List<Pipeline> pipelines = new ArrayList<Pipeline>(); protected List<Pipeline> pipelines = new ArrayList<Pipeline>();
protected PageProcessor pageProcessor; protected PageProcessor pageProcessor;
protected List<String> startUrls; protected List<String> startUrls;
protected Site site; protected Site site;
protected String uuid; protected String uuid;
protected Scheduler scheduler = new QueueScheduler(); protected Scheduler scheduler = new QueueScheduler();
protected Logger logger = Logger.getLogger(getClass()); protected Logger logger = Logger.getLogger(getClass());
protected ExecutorService executorService; protected ExecutorService executorService;
protected int threadNum = 1; protected int threadNum = 1;
protected AtomicInteger stat = new AtomicInteger(STAT_INIT); protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
protected final static int STAT_INIT = 0; protected final static int STAT_INIT = 0;
protected final static int STAT_RUNNING = 1; protected final static int STAT_RUNNING = 1;
protected final static int STAT_STOPPED = 2; protected final static int STAT_STOPPED = 2;
/** /**
* create a spider with pageProcessor. * create a spider with pageProcessor.
* *
* @param pageProcessor * @param pageProcessor
* @return new spider * @return new spider
* @see PageProcessor * @see PageProcessor
*/ */
public static Spider create(PageProcessor pageProcessor) { public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor); return new Spider(pageProcessor);
} }
/** /**
* create a spider with pageProcessor. * create a spider with pageProcessor.
* *
* @param pageProcessor * @param pageProcessor
*/ */
public Spider(PageProcessor pageProcessor) { public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor; this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite(); this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls(); this.startUrls = pageProcessor.getSite().getStartUrls();
} }
/** /**
* Set startUrls of Spider.<br> * Set startUrls of Spider.<br>
* Prior to startUrls of Site. * Prior to startUrls of Site.
* *
* @param startUrls * @param startUrls
* @return this * @return this
*/ */
public Spider startUrls(List<String> startUrls) { public Spider startUrls(List<String> startUrls) {
checkIfRunning(); checkIfRunning();
this.startUrls = startUrls; this.startUrls = startUrls;
return this; return this;
} }
/** /**
* Set an uuid for spider.<br> * Set an uuid for spider.<br>
* Default uuid is domain of site.<br> * Default uuid is domain of site.<br>
* *
* @param uuid * @param uuid
* @return this * @return this
*/ */
public Spider setUUID(String uuid) { public Spider setUUID(String uuid) {
this.uuid = uuid; this.uuid = uuid;
return this; return this;
} }
/** /**
* set scheduler for Spider * set scheduler for Spider
* *
* @param scheduler * @param scheduler
* @return this * @return this
* @Deprecated * @Deprecated
* @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
*/ */
public Spider scheduler(Scheduler scheduler) { public Spider scheduler(Scheduler scheduler) {
return setScheduler(scheduler); return setScheduler(scheduler);
} }
/** /**
* set scheduler for Spider * set scheduler for Spider
* *
* @param scheduler * @param scheduler
* @return this * @return this
* @see Scheduler * @see Scheduler
* @since 0.2.1 * @since 0.2.1
*/ */
public Spider setScheduler(Scheduler scheduler) { public Spider setScheduler(Scheduler scheduler) {
checkIfRunning(); checkIfRunning();
this.scheduler = scheduler; this.scheduler = scheduler;
return this; return this;
} }
/** /**
* add a pipeline for Spider * add a pipeline for Spider
* *
* @param pipeline * @param pipeline
* @return this * @return this
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated * @deprecated
*/ */
public Spider pipeline(Pipeline pipeline) { public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline); return addPipeline(pipeline);
} }
/** /**
* add a pipeline for Spider * add a pipeline for Spider
* *
* @param pipeline * @param pipeline
* @return this * @return this
* @see Pipeline * @see Pipeline
* @since 0.2.1 * @since 0.2.1
*/ */
public Spider addPipeline(Pipeline pipeline) { public Spider addPipeline(Pipeline pipeline) {
checkIfRunning(); checkIfRunning();
this.pipelines.add(pipeline); this.pipelines.add(pipeline);
return this; return this;
} }
/** /**
* clear the pipelines set * clear the pipelines set
* *
* @return this * @return this
*/ */
public Spider clearPipeline() { public Spider clearPipeline() {
pipelines = new ArrayList<Pipeline>(); pipelines = new ArrayList<Pipeline>();
return this; return this;
} }
/** /**
* set the downloader of spider * set the downloader of spider
* *
* @param downloader * @param downloader
* @return this * @return this
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated * @deprecated
*/ */
public Spider downloader(Downloader downloader) { public Spider downloader(Downloader downloader) {
return setDownloader(downloader); return setDownloader(downloader);
} }
/** /**
* set the downloader of spider * set the downloader of spider
* *
* @param downloader * @param downloader
* @return this * @return this
* @see Downloader * @see Downloader
*/ */
public Spider setDownloader(Downloader downloader) { public Spider setDownloader(Downloader downloader) {
checkIfRunning(); checkIfRunning();
this.downloader = downloader; this.downloader = downloader;
return this; return this;
} }
protected void checkComponent() { protected void checkComponent() {
if (downloader == null) { if (downloader == null) {
this.downloader = new HttpClientDownloader(); this.downloader = new HttpClientDownloader();
} }
if (pipelines.isEmpty()) { if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline()); pipelines.add(new ConsolePipeline());
} }
downloader.setThread(threadNum); downloader.setThread(threadNum);
} }
@Override @Override
public void run() { public void run() {
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
&& !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { throw new IllegalStateException("Spider is already running!");
throw new IllegalStateException("Spider is already running!"); }
} checkComponent();
checkComponent(); if (startUrls != null) {
if (startUrls != null) { for (String startUrl : startUrls) {
for (String startUrl : startUrls) { scheduler.push(new Request(startUrl), this);
scheduler.push(new Request(startUrl), this); }
} startUrls.clear();
startUrls.clear(); }
} Request request = scheduler.poll(this);
Request request = scheduler.poll(this); logger.info("Spider " + getUUID() + " started!");
//single thread // single thread
if (executorService == null) { if (threadNum <= 1) {
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
processRequest(request); processRequest(request);
request = scheduler.poll(this); request = scheduler.poll(this);
} }
} else { } else {
//multi thread synchronized (this) {
final AtomicInteger threadAlive = new AtomicInteger(0); this.executorService = ThreadUtils.newFixedThreadPool(threadNum);
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { }
if (request == null) { // multi thread
//when no request found but some thread is alive, sleep a while. final AtomicInteger threadAlive = new AtomicInteger(0);
try { while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
Thread.sleep(100); if (request == null) {
} catch (InterruptedException e) { // when no request found but some thread is alive, sleep a
} // while.
} else { try {
final Request requestFinal = request; Thread.sleep(100);
threadAlive.incrementAndGet(); } catch (InterruptedException e) {
executorService.execute(new Runnable() { }
@Override } else {
public void run() { final Request requestFinal = request;
processRequest(requestFinal); threadAlive.incrementAndGet();
threadAlive.decrementAndGet(); executorService.execute(new Runnable() {
} @Override
}); public void run() {
} processRequest(requestFinal);
request = scheduler.poll(this); threadAlive.decrementAndGet();
if (threadAlive.get() == 0) { }
request = scheduler.poll(this); });
if (request == null) { }
break; request = scheduler.poll(this);
} if (threadAlive.get() == 0) {
} request = scheduler.poll(this);
} if (request == null) {
executorService.shutdown(); break;
} }
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); }
//release some resources }
destroy(); executorService.shutdown();
} }
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
protected void destroy() { // release some resources
destroyEach(downloader); destroy();
destroyEach(pageProcessor); }
for (Pipeline pipeline : pipelines) {
destroyEach(pipeline); protected void destroy() {
} destroyEach(downloader);
} destroyEach(pageProcessor);
for (Pipeline pipeline : pipelines) {
private void destroyEach(Object object) { destroyEach(pipeline);
if (object instanceof Closeable) { }
try { }
((Closeable) object).close();
} catch (IOException e) { private void destroyEach(Object object) {
e.printStackTrace(); if (object instanceof Closeable) {
} try {
} ((Closeable) object).close();
} } catch (IOException e) {
e.printStackTrace();
/** }
* Process specific urls without url discovering. }
* }
* @param urls urls to process
*/ /**
public void test(String... urls) { * Process specific urls without url discovering.
checkComponent(); *
if (urls.length > 0) { * @param urls
for (String url : urls) { * urls to process
processRequest(new Request(url)); */
} public void test(String... urls) {
} checkComponent();
} if (urls.length > 0) {
for (String url : urls) {
protected void processRequest(Request request) { processRequest(new Request(url));
Page page = downloader.download(request, this); }
if (page == null) { }
sleep(site.getSleepTime()); }
return;
} protected void processRequest(Request request) {
//for cycle retry Page page = downloader.download(request, this);
if (page.getHtml() == null) { if (page == null) {
addRequest(page); sleep(site.getSleepTime());
sleep(site.getSleepTime()); return;
return; }
} // for cycle retry
pageProcessor.process(page); if (page.getHtml() == null) {
addRequest(page); addRequest(page);
if (!page.getResultItems().isSkip()) { sleep(site.getSleepTime());
for (Pipeline pipeline : pipelines) { return;
pipeline.process(page.getResultItems(), this); }
} pageProcessor.process(page);
} addRequest(page);
sleep(site.getSleepTime()); if (!page.getResultItems().isSkip()) {
} for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
protected void sleep(int time) { }
try { }
Thread.sleep(time); sleep(site.getSleepTime());
} catch (InterruptedException e) { }
e.printStackTrace();
} protected void sleep(int time) {
} try {
Thread.sleep(time);
protected void addRequest(Page page) { } catch (InterruptedException e) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { e.printStackTrace();
for (Request request : page.getTargetRequests()) { }
scheduler.push(request, this); }
}
} protected void addRequest(Page page) {
} if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
protected void checkIfRunning() { scheduler.push(request, this);
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { }
throw new IllegalStateException("Spider is already running!"); }
} }
}
protected void checkIfRunning() {
public void runAsync() { if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) {
Thread thread = new Thread(this); throw new IllegalStateException("Spider is already running!");
thread.setDaemon(false); }
thread.start(); }
}
public void runAsync() {
public void start() { Thread thread = new Thread(this);
runAsync(); thread.setDaemon(false);
} thread.start();
}
public void stop() {
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); public void start() {
} runAsync();
}
public void stopAndDestroy() {
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); public void stop() {
destroy(); if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) {
} if (executorService != null) {
executorService.shutdown();
/** }
* start with more than one threads logger.info("Spider " + getUUID() + " stop success!");
* } else {
* @param threadNum logger.info("Spider " + getUUID() + " stop fail!");
* @return this }
*/ }
public Spider thread(int threadNum) {
checkIfRunning(); public void stopAndDestroy() {
this.threadNum = threadNum; stop();
if (threadNum <= 0) { destroy();
throw new IllegalArgumentException("threadNum should be more than one!"); }
}
if (threadNum == 1) { /**
return this; * start with more than one threads
} *
synchronized (this) { * @param threadNum
this.executorService = ThreadUtils.newFixedThreadPool(threadNum); * @return this
} */
return this; public Spider thread(int threadNum) {
} checkIfRunning();
this.threadNum = threadNum;
/** if (threadNum <= 0) {
* switch off xsoup throw new IllegalArgumentException("threadNum should be more than one!");
* }
* @return if (threadNum == 1) {
*/ return this;
public static void xsoupOff() { }
EnvironmentUtil.setUseXsoup(false); return this;
} }
@Override /**
public String getUUID() { * switch off xsoup
if (uuid != null) { *
return uuid; * @return
} */
if (site != null) { public static void xsoupOff() {
return site.getDomain(); EnvironmentUtil.setUseXsoup(false);
} }
return null;
} @Override
public String getUUID() {
@Override if (uuid != null) {
public Site getSite() { return uuid;
return site; }
} if (site != null) {
return site.getDomain();
}
return null;
}
@Override
public Site getSite() {
return site;
}
} }

@ -0,0 +1,36 @@
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * Example {@link PageProcessor} that starts at https://github.com/code4craft,
 * follows links shaped like repository URLs and records the "author", "name"
 * and "readme" fields for each page. Pages without a "name" are marked as skipped.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
public class GithubRepoPageProcesser implements PageProcessor {

    /** Links matching this pattern are queued as further requests. */
    private static final String REPO_LINK_REGEX = "(https://github\\.com/\\w+/\\w+)";

    /** Captures the first path segment of the page URL; stored as "author". */
    private static final String AUTHOR_REGEX = "https://github\\.com/(\\w+)/.*";

    /** Selector whose result is stored as "name". */
    private static final String NAME_XPATH = "//h1[@class='entry-title public']/strong/a/text()";

    /** Selector whose result is stored as "readme". */
    private static final String README_XPATH = "//div[@id='readme']/tidyText()";

    private Site site = Site.me()
            .addStartUrl("https://github.com/code4craft")
            .setRetryTimes(3)
            .setSleepTime(100);

    /**
     * Queues the repository-like links found on the page and fills the result fields.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex(REPO_LINK_REGEX).all());

        page.putField("author", page.getUrl().regex(AUTHOR_REGEX).toString());
        page.putField("name", page.getHtml().xpath(NAME_XPATH).toString());

        // A page with no "name" is flagged as skipped.
        boolean nameMissing = page.getResultItems().get("name") == null;
        if (nameMissing) {
            page.setSkip(true);
        }

        // Unlike the other fields, "readme" stores the selection result itself,
        // not its string form.
        page.putField("readme", page.getHtml().xpath(README_XPATH));
    }

    /**
     * @return the site configuration built for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the example with five threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new GithubRepoPageProcesser());
        spider.thread(5).run();
    }
}

@ -0,0 +1,39 @@
package us.codecraft.webmagic.processor.example;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
/**
 * Example {@link PageProcessor} that starts at
 * http://my.oschina.net/flashsword/blog, follows links to numbered posts of
 * that blog and records the "title", "content" and "tags" fields.
 * Pages without a "title" are marked as skipped.
 *
 * @author code4crafter@gmail.com <br>
 */
public class OschinaBlogPageProcesser implements PageProcessor {

    /** Links matching this pattern are queued as further requests. */
    private static final String POST_LINK_REGEX = "http://my\\.oschina\\.net/flashsword/blog/\\d+";

    /** Selector whose result is stored as "title". */
    private static final String TITLE_XPATH = "//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()";

    /** Selector whose result is stored as "content". */
    private static final String CONTENT_XPATH = "//div[@class='BlogContent']/tidyText()";

    /** Selector whose results are stored as "tags". */
    private static final String TAGS_XPATH = "//div[@class='BlogTags']/a/text()";

    private Site site = Site.me()
            .setDomain("my.oschina.net")
            .addStartUrl("http://my.oschina.net/flashsword/blog");

    /**
     * Queues the post links found on the page and fills the result fields.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        final List<String> postLinks = page.getHtml().links().regex(POST_LINK_REGEX).all();
        page.addTargetRequests(postLinks);

        page.putField("title", page.getHtml().xpath(TITLE_XPATH).toString());

        // A page with no "title" is flagged as skipped.
        boolean titleMissing = page.getResultItems().get("title") == null;
        if (titleMissing) {
            page.setSkip(true);
        }

        page.putField("content", page.getHtml().xpath(CONTENT_XPATH).toString());
        page.putField("tags", page.getHtml().xpath(TAGS_XPATH).all());
    }

    /**
     * @return the site configuration built for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the example with two threads.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new OschinaBlogPageProcesser());
        spider.thread(2).run();
    }
}

@ -44,9 +44,4 @@ public abstract class Selectors {
return new OrSelector(selectors); return new OrSelector(selectors);
} }
public static void main(String[] args) {
String s = "a";
or(regex("<title>(.*)</title>"), xpath("//title"), $("title")).select(s);
}
} }

@ -1,7 +1,7 @@
package us.codecraft.webmagic.utils; package us.codecraft.webmagic.utils;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -11,22 +11,11 @@ import java.util.concurrent.TimeUnit;
*/ */
public class ThreadUtils { public class ThreadUtils {
public static ExecutorService newFixedThreadPool(int threadSize) { public static ExecutorService newFixedThreadPool(int threadSize) {
return new ThreadPoolExecutor(threadSize, threadSize, 0L, TimeUnit.MILLISECONDS, if (threadSize <= 1) {
new LinkedBlockingQueue<Runnable>(1) { throw new IllegalArgumentException("ThreadSize must be greater than 1!");
}
private static final long serialVersionUID = -9028058603126367678L; return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS,
new SynchronousQueue<Runnable>(), new ThreadPoolExecutor.CallerRunsPolicy());
@Override }
public boolean offer(Runnable e) {
try {
put(e);
return true;
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
}
return false;
}
});
}
} }

@ -16,8 +16,6 @@ import java.util.regex.Pattern;
*/ */
public class UrlUtils { public class UrlUtils {
private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/");
/** /**
* canonicalizeUrl * canonicalizeUrl
* *

@ -18,11 +18,12 @@ public class SpiderTest {
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
System.out.println(1); System.out.println(1);
} }
}); }).thread(2);
spider.start(); spider.start();
Thread.sleep(10000); Thread.sleep(10000);
spider.stop(); spider.stop();
// spider.run(); Thread.sleep(10000);
spider.start();
Thread.sleep(10000); Thread.sleep(10000);
} }
} }

@ -3,7 +3,7 @@
<parent> <parent>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<version>0.3.1</version> <version>0.3.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -1,10 +1,9 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.example;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl; import us.codecraft.webmagic.model.annotation.HelpUrl;
@ -14,6 +13,7 @@ import java.util.List;
/** /**
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
* @since 0.3.2
*/ */
@TargetUrl("https://github.com/\\w+/\\w+") @TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
@ -25,31 +25,24 @@ public class GithubRepo implements HasKey {
@ExtractByUrl("https://github\\.com/(\\w+)/.*") @ExtractByUrl("https://github\\.com/(\\w+)/.*")
private String author; private String author;
@ExtractBy("//div[@id='readme']") @ExtractBy("//div[@id='readme']/tidyText()")
private String readme; private String readme;
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true) @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true)
private List<String> language; private List<String> language;
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()")
private String star; private int star;
@ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
private String fork; private int fork;
@ExtractByUrl @ExtractByUrl
private String url; private String url;
@Test public static void main(String[] args) {
public void test() { OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100)
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) , new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run();
, new PageModelPipeline<GithubRepo>() {
@Override
public void process(GithubRepo o, Task task) {
Assert.assertEquals("78",o.getStar().trim());
Assert.assertEquals("65",o.getFork().trim());
}
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
} }
@Override @Override
@ -77,11 +70,11 @@ public class GithubRepo implements HasKey {
return url; return url;
} }
public String getStar() { public int getStar() {
return star; return star;
} }
public String getFork() { public int getFork() {
return fork; return fork;
} }
} }

@ -0,0 +1,54 @@
package us.codecraft.webmagic.example;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.Formatter;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline;
import java.util.Date;
import java.util.List;
/**
 * Annotation-driven page model for posts of the blog at
 * http://my.oschina.net/flashsword/blog. Each field carries the extraction
 * expression used to fill it; the class itself only declares fields and
 * read-only accessors.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
// Target URL pattern: numbered post pages of the flashsword blog.
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {
// Filled from the expression //title/text().
@ExtractBy("//title/text()")
private String title;
// Filled from the CSS selector div.BlogContent (type is explicitly Css).
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
private String content;
// Multi-valued: one entry per match of the tag-link text expression.
@ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true)
private List<String> tags;
// The regex step selects text shaped like "d-d-d d:d" inside the BlogStat div.
// NOTE(review): the @Formatter value looks like the date pattern used to parse
// that text into a Date - confirm against the formatter implementation.
@Formatter("yyyy-MM-dd HH:mm")
@ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')")
private Date date;
/**
 * Runs the example: starts from the blog index page and hands extracted
 * models to a JsonFilePageModelPipeline created with "/data/webmagic/".
 *
 * @param args unused
 */
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog")
, new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run();
}
/**
 * @return the extracted title, as stored in the field
 */
public String getTitle() {
return title;
}
/**
 * @return the extracted content, as stored in the field
 */
public String getContent() {
return content;
}
/**
 * @return the extracted tags, as stored in the field
 */
public List<String> getTags() {
return tags;
}
/**
 * @return the extracted date, as stored in the field
 */
public Date getDate() {
return date;
}
}

@ -1,5 +1,6 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.selector.Selector; import us.codecraft.webmagic.selector.Selector;
import java.lang.reflect.Field; import java.lang.reflect.Field;
@ -16,6 +17,8 @@ class FieldExtractor extends Extractor {
private Method setterMethod; private Method setterMethod;
private ObjectFormatter objectFormatter;
public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi); super(selector, source, notNull, multi);
this.field = field; this.field = field;
@ -44,4 +47,12 @@ class FieldExtractor extends Extractor {
boolean isNotNull() { boolean isNotNull() {
return notNull; return notNull;
} }
/**
 * @return the formatter attached to this field extractor, or null if none has been set
 */
ObjectFormatter getObjectFormatter() {
return objectFormatter;
}
/**
 * Attaches the formatter to this field extractor.
 *
 * @param objectFormatter the formatter to store
 */
void setObjectFormatter(ObjectFormatter objectFormatter) {
this.objectFormatter = objectFormatter;
}
} }

@ -1,8 +1,12 @@
package us.codecraft.webmagic.model; package us.codecraft.webmagic.model;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*; import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.formatter.BasicTypeFormatter;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.formatter.ObjectFormatters;
import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ExtractorUtils; import us.codecraft.webmagic.utils.ExtractorUtils;
@ -36,6 +40,8 @@ class PageModelExtractor {
private Extractor objectExtractor; private Extractor objectExtractor;
private Logger logger = Logger.getLogger(getClass());
public static PageModelExtractor create(Class clazz) { public static PageModelExtractor create(Class clazz) {
PageModelExtractor pageModelExtractor = new PageModelExtractor(); PageModelExtractor pageModelExtractor = new PageModelExtractor();
pageModelExtractor.init(clazz); pageModelExtractor.init(clazz);
@ -62,16 +68,61 @@ class PageModelExtractor {
fieldExtractor = fieldExtractorTmp; fieldExtractor = fieldExtractorTmp;
} }
if (fieldExtractor != null) { if (fieldExtractor != null) {
if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { checkFormat(field, fieldExtractor);
throw new IllegalStateException("Field " + field.getName() + " must be string");
} else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) {
throw new IllegalStateException("Field " + field.getName() + " must be list");
}
fieldExtractors.add(fieldExtractor); fieldExtractors.add(fieldExtractor);
} }
} }
} }
/**
 * Validates the declared type of an extracted field and, where a conversion
 * is needed, attaches the matching {@link ObjectFormatter} to the extractor.
 * <ul>
 * <li>Single-valued String fields need nothing.</li>
 * <li>Other single-valued fields get the formatter resolved for their basic class.</li>
 * <li>Multi-valued fields must be a List; a formatter is attached only when a
 * {@link Formatter} annotation names a subClazz other than Void.</li>
 * </ul>
 *
 * @param field          the model field being bound
 * @param fieldExtractor the extractor built for that field
 * @throws IllegalStateException if a multi-valued field is not a List, or no
 *                               formatter can be obtained for the required type
 */
private void checkFormat(Field field, FieldExtractor fieldExtractor) {
    if (!fieldExtractor.isMulti()) {
        // Plain String targets take the extracted text as-is.
        if (String.class.isAssignableFrom(field.getType())) {
            return;
        }
        Class<?> basicClazz = BasicTypeFormatter.detectBasicClass(field.getType());
        ObjectFormatter singleFormatter = getObjectFormatter(field, basicClazz);
        if (singleFormatter == null) {
            throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + basicClazz);
        }
        fieldExtractor.setObjectFormatter(singleFormatter);
        return;
    }

    if (!List.class.isAssignableFrom(field.getType())) {
        throw new IllegalStateException("Field " + field.getName() + " must be list");
    }
    Formatter annotation = field.getAnnotation(Formatter.class);
    // Without an annotation, or with the Void placeholder, elements are left unconverted.
    if (annotation == null || annotation.subClazz().equals(Void.class)) {
        return;
    }
    ObjectFormatter elementFormatter = getObjectFormatter(field, annotation.subClazz());
    if (elementFormatter == null) {
        throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + annotation.subClazz());
    }
    fieldExtractor.setObjectFormatter(elementFormatter);
}
/**
 * Resolves the formatter used to convert extracted text for a field.
 * <p>
 * If the field carries a {@link Formatter} annotation that names an explicit
 * implementation (anything other than the {@code ObjectFormatter.class} default),
 * that implementation is instantiated, configured with the annotation's
 * {@code value()} and returned. Otherwise the formatter registered in
 * {@link ObjectFormatters} for {@code fieldClazz} is instantiated.
 *
 * @param field      the model field being bound
 * @param fieldClazz the target type the text must be converted to
 * @return a ready-to-use formatter, or {@code null} if none is registered for
 *         the type or the formatter class could not be instantiated
 */
private ObjectFormatter getObjectFormatter(Field field, Class<?> fieldClazz) {
    Formatter formatter = field.getAnnotation(Formatter.class);
    if (formatter != null && !formatter.formatter().equals(ObjectFormatter.class)) {
        ObjectFormatter customFormatter = initFormatter(formatter.formatter());
        if (customFormatter != null) {
            customFormatter.initParam(formatter.value());
        }
        // Return the explicitly requested formatter; previously it was built and then discarded.
        return customFormatter;
    }
    Class<? extends ObjectFormatter> registeredClazz = ObjectFormatters.get(fieldClazz);
    if (registeredClazz == null) {
        // Nothing registered for this type: let the caller report it instead of failing with an NPE.
        return null;
    }
    return initFormatter(registeredClazz);
}
/**
 * Creates a formatter through its no-argument constructor.
 *
 * @param formatterClazz the formatter implementation to instantiate
 * @return the new instance, or {@code null} if instantiation failed (the failure is logged)
 */
private ObjectFormatter initFormatter(Class<? extends ObjectFormatter> formatterClazz) {
    ObjectFormatter created = null;
    try {
        created = formatterClazz.newInstance();
    } catch (InstantiationException e) {
        logger.error("init ObjectFormatter fail", e);
    } catch (IllegalAccessException e) {
        logger.error("init ObjectFormatter fail", e);
    }
    return created;
}
private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
FieldExtractor fieldExtractor = null; FieldExtractor fieldExtractor = null;
ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class);
@ -231,7 +282,12 @@ class PageModelExtractor {
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
return null; return null;
} }
setField(o, fieldExtractor, value); if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = convert(value, fieldExtractor.getObjectFormatter());
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
} else { } else {
String value; String value;
switch (fieldExtractor.getSource()) { switch (fieldExtractor.getSource()) {
@ -254,23 +310,58 @@ class PageModelExtractor {
if (value == null && fieldExtractor.isNotNull()) { if (value == null && fieldExtractor.isNotNull()) {
return null; return null;
} }
setField(o, fieldExtractor, value); if (fieldExtractor.getObjectFormatter() != null) {
Object converted = convert(value, fieldExtractor.getObjectFormatter());
if (converted == null && fieldExtractor.isNotNull()) {
return null;
}
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
} }
} }
if (AfterExtractor.class.isAssignableFrom(clazz)) { if (AfterExtractor.class.isAssignableFrom(clazz)) {
((AfterExtractor) o).afterProcess(page); ((AfterExtractor) o).afterProcess(page);
} }
} catch (InstantiationException e) { } catch (InstantiationException e) {
e.printStackTrace(); logger.error("extract fail", e);
} catch (IllegalAccessException e) { } catch (IllegalAccessException e) {
e.printStackTrace(); logger.error("extract fail", e);
} catch (InvocationTargetException e) { } catch (InvocationTargetException e) {
e.printStackTrace(); logger.error("extract fail", e);
} }
return o; return o;
} }
/**
 * Converts one extracted string with the given formatter.
 *
 * @param value           the raw extracted text
 * @param objectFormatter the formatter to apply
 * @return the converted object, or {@code null} if the formatter threw (the error is logged)
 */
private Object convert(String value, ObjectFormatter objectFormatter) {
    try {
        Object formatted = objectFormatter.format(value);
        if (logger.isDebugEnabled()) {
            logger.debug("String " + value + " is converted to " + formatted);
        }
        return formatted;
    } catch (Exception e) {
        // Conversion problems are reported but never propagated to the extraction loop.
        logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
        return null;
    }
}
/**
 * Converts every extracted string with the given formatter.
 * <p>
 * Elements whose conversion yields {@code null} (including failed conversions)
 * are dropped from the result.
 *
 * @param values          the raw extracted texts; may be {@code null}
 * @param objectFormatter the formatter to apply to each element
 * @return a new list with the successfully converted elements; empty when
 *         {@code values} is {@code null} or nothing could be converted
 */
private List<Object> convert(List<String> values, ObjectFormatter objectFormatter) {
    List<Object> objects = new ArrayList<Object>();
    if (values == null) {
        // A missing extraction result for an optional field must not fail with an NPE.
        return objects;
    }
    for (String value : values) {
        Object converted = convert(value, objectFormatter);
        if (converted != null) {
            objects.add(converted);
        }
    }
    return objects;
}
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
if (value==null){
return;
}
if (fieldExtractor.getSetterMethod() != null) { if (fieldExtractor.getSetterMethod() != null) {
fieldExtractor.getSetterMethod().invoke(o, value); fieldExtractor.getSetterMethod().invoke(o, value);
} }

@ -0,0 +1,41 @@
package us.codecraft.webmagic.model.annotation;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.Target;
/**
 * Defines how the extracted result string is converted to an object before it
 * is assigned to the annotated field.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.2
 */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface Formatter {

    /**
     * Parameters for the formatter. They are handed to
     * {@link ObjectFormatter#initParam(String[])} of an explicitly configured
     * formatter; their meaning depends on the implementation.
     *
     * @return formatter params
     */
    String[] value();

    /**
     * Specifies the class of the elements when the field is a collection. <br/>
     * It does not need to be set for other fields, because the target class is
     * detected from the field's own type. <br/>
     * The default {@code Void.class} means "not specified".
     *
     * @return the class of the collection elements
     */
    Class subClazz() default Void.class;

    /**
     * If there is more than one formatter for a class, specifies the
     * implementation to use. The default {@code ObjectFormatter.class} means
     * that no explicit implementation is requested.
     *
     * @return the formatter implementation
     */
    Class<? extends ObjectFormatter> formatter() default ObjectFormatter.class;
}

@ -0,0 +1,150 @@
package us.codecraft.webmagic.model.formatter;
import java.util.Arrays;
import java.util.List;
/**
 * Base class for formatters that parse extracted text into the boxed Java basic
 * types. The raw text is trimmed before the concrete parser sees it, and a
 * {@code null} input is passed through as {@code null}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public abstract class BasicTypeFormatter<T> implements ObjectFormatter<T> {

    /** All built-in basic-type formatter implementations. */
    public static final List<Class<? extends ObjectFormatter>> basicTypeFormatters = Arrays.<Class<? extends ObjectFormatter>>asList(
            IntegerFormatter.class,
            LongFormatter.class,
            DoubleFormatter.class,
            FloatFormatter.class,
            ShortFormatter.class,
            CharactorFormatter.class,
            ByteFormatter.class,
            BooleanFormatter.class);

    /**
     * Maps a primitive type to its wrapper class; every other type is returned unchanged.
     *
     * @param type the declared type of a field
     * @return the wrapper class for a primitive, otherwise {@code type} itself
     */
    public static Class<?> detectBasicClass(Class<?> type) {
        if (!type.isPrimitive()) {
            // Wrapper classes and all other reference types map to themselves.
            return type;
        }
        if (type == Integer.TYPE) {
            return Integer.class;
        }
        if (type == Long.TYPE) {
            return Long.class;
        }
        if (type == Double.TYPE) {
            return Double.class;
        }
        if (type == Float.TYPE) {
            return Float.class;
        }
        if (type == Short.TYPE) {
            return Short.class;
        }
        if (type == Character.TYPE) {
            return Character.class;
        }
        if (type == Byte.TYPE) {
            return Byte.class;
        }
        if (type == Boolean.TYPE) {
            return Boolean.class;
        }
        // Remaining primitive (void) has no formatter mapping.
        return type;
    }

    /** Basic-type formatters take no parameters. */
    @Override
    public void initParam(String[] extra) {
    }

    /**
     * Trims the raw text and delegates to {@link #formatTrimmed(String)}.
     *
     * @param raw the extracted text, may be {@code null}
     * @return the parsed value, or {@code null} when {@code raw} is {@code null}
     * @throws Exception if the trimmed text cannot be parsed
     */
    @Override
    public T format(String raw) throws Exception {
        return raw == null ? null : formatTrimmed(raw.trim());
    }

    /**
     * Parses already-trimmed, non-null text.
     *
     * @param raw trimmed text
     * @return the parsed value
     * @throws Exception if the text cannot be parsed
     */
    protected abstract T formatTrimmed(String raw) throws Exception;

    /** Parses decimal integers. */
    public static class IntegerFormatter extends BasicTypeFormatter<Integer> {
        @Override
        public Class<Integer> clazz() {
            return Integer.class;
        }

        @Override
        public Integer formatTrimmed(String raw) throws Exception {
            return Integer.valueOf(raw);
        }
    }

    /** Parses decimal longs. */
    public static class LongFormatter extends BasicTypeFormatter<Long> {
        @Override
        public Class<Long> clazz() {
            return Long.class;
        }

        @Override
        public Long formatTrimmed(String raw) throws Exception {
            return Long.valueOf(raw);
        }
    }

    /** Parses doubles. */
    public static class DoubleFormatter extends BasicTypeFormatter<Double> {
        @Override
        public Class<Double> clazz() {
            return Double.class;
        }

        @Override
        public Double formatTrimmed(String raw) throws Exception {
            return Double.valueOf(raw);
        }
    }

    /** Parses floats. */
    public static class FloatFormatter extends BasicTypeFormatter<Float> {
        @Override
        public Class<Float> clazz() {
            return Float.class;
        }

        @Override
        public Float formatTrimmed(String raw) throws Exception {
            return Float.valueOf(raw);
        }
    }

    /** Parses decimal shorts. */
    public static class ShortFormatter extends BasicTypeFormatter<Short> {
        @Override
        public Class<Short> clazz() {
            return Short.class;
        }

        @Override
        public Short formatTrimmed(String raw) throws Exception {
            return Short.valueOf(raw);
        }
    }

    /** Takes the first character of the trimmed text. */
    public static class CharactorFormatter extends BasicTypeFormatter<Character> {
        @Override
        public Class<Character> clazz() {
            return Character.class;
        }

        @Override
        public Character formatTrimmed(String raw) throws Exception {
            return Character.valueOf(raw.charAt(0));
        }
    }

    /** Parses decimal bytes. */
    public static class ByteFormatter extends BasicTypeFormatter<Byte> {
        @Override
        public Class<Byte> clazz() {
            return Byte.class;
        }

        @Override
        public Byte formatTrimmed(String raw) throws Exception {
            return Byte.valueOf(raw, 10);
        }
    }

    /** Parses booleans; only a case-insensitive "true" yields {@code true}. */
    public static class BooleanFormatter extends BasicTypeFormatter<Boolean> {
        @Override
        public Class<Boolean> clazz() {
            return Boolean.class;
        }

        @Override
        public Boolean formatTrimmed(String raw) throws Exception {
            return Boolean.valueOf(raw);
        }
    }
}

@ -0,0 +1,29 @@
package us.codecraft.webmagic.model.formatter;
import org.apache.commons.lang3.time.DateUtils;
import java.util.Date;
/**
 * Formatter that parses extracted text into a {@link Date}.
 * <p>
 * The accepted patterns default to {@code "yyyy-MM-dd HH:mm"} and can be
 * replaced through {@link #initParam(String[])}.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class DateFormatter implements ObjectFormatter<Date> {

    private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"};

    /**
     * Parses the text against the configured patterns.
     *
     * @param raw the extracted text, may be {@code null}
     * @return the parsed date, or {@code null} when {@code raw} is {@code null}
     * @throws Exception if the text matches none of the patterns
     */
    @Override
    public Date format(String raw) throws Exception {
        if (raw == null) {
            // Same contract as the basic-type formatters: nothing extracted, nothing to parse.
            return null;
        }
        return DateUtils.parseDate(raw, datePatterns);
    }

    @Override
    public Class<Date> clazz() {
        return Date.class;
    }

    /**
     * Replaces the accepted date patterns.
     *
     * @param extra the patterns to use; a {@code null} or empty array is ignored
     *              so the current patterns stay in effect
     */
    @Override
    public void initParam(String[] extra) {
        if (extra == null || extra.length == 0) {
            return;
        }
        // Copy so later changes to the caller's array cannot alter the patterns.
        datePatterns = extra.clone();
    }
}

@ -0,0 +1,14 @@
package us.codecraft.webmagic.model.formatter;
/**
 * Converts an extracted string into an object of type {@code T}.
 *
 * @author code4crafter@gmail.com
 */
public interface ObjectFormatter<T> {

    /**
     * Converts the raw extracted text.
     *
     * @param raw the extracted text
     * @return the converted object
     * @throws Exception if the text cannot be converted
     */
    T format(String raw) throws Exception;

    /**
     * @return the class this formatter produces; used as the lookup key when
     *         the formatter is registered in {@code ObjectFormatters}
     */
    Class<T> clazz();

    /**
     * Receives implementation-specific parameters (for example date patterns
     * for {@code DateFormatter}).
     *
     * @param extra the parameters
     */
    void initParam(String[] extra);
}

@ -0,0 +1,34 @@
package us.codecraft.webmagic.model.formatter;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Registry that maps a target class to the {@link ObjectFormatter}
 * implementation producing it. The built-in basic-type formatters and
 * {@link DateFormatter} are registered when the class is loaded.
 *
 * @author code4crafter@gmail.com
 * @since 0.3.2
 */
public class ObjectFormatters {

    private static Map<Class, Class<? extends ObjectFormatter>> formatterMap = new ConcurrentHashMap<Class, Class<? extends ObjectFormatter>>();

    static {
        for (Class<? extends ObjectFormatter> builtIn : BasicTypeFormatter.basicTypeFormatters) {
            put(builtIn);
        }
        put(DateFormatter.class);
    }

    /**
     * Registers a formatter implementation under the class reported by its
     * {@code clazz()} method. An implementation that cannot be instantiated is
     * not registered; the failure is printed to standard error.
     *
     * @param objectFormatter the formatter implementation to register
     */
    public static void put(Class<? extends ObjectFormatter> objectFormatter) {
        ObjectFormatter probe;
        try {
            // An instance is needed only to ask which class the formatter produces.
            probe = objectFormatter.newInstance();
        } catch (InstantiationException e) {
            e.printStackTrace();
            return;
        } catch (IllegalAccessException e) {
            e.printStackTrace();
            return;
        }
        formatterMap.put(probe.clazz(), objectFormatter);
    }

    /**
     * Looks up the formatter implementation registered for a class.
     *
     * @param clazz the target class
     * @return the registered implementation, or {@code null} if there is none
     */
    public static Class<? extends ObjectFormatter> get(Class<?> clazz) {
        return formatterMap.get(clazz);
    }
}

@ -18,7 +18,7 @@ public class MockDownloader implements Downloader{
" <head prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# githubog: http://ogp.me/ns/fb/githubog#\">\n" + " <head prefix=\"og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# githubog: http://ogp.me/ns/fb/githubog#\">\n" +
" <meta charset='utf-8'>\n" + " <meta charset='utf-8'>\n" +
" <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n" + " <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n" +
" <title>code4craft/webmagic</title>\n" + " <title>code4craft/webmagic · GitHub</title>\n" +
" <link rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"/opensearch.xml\" title=\"GitHub\" />\n" + " <link rel=\"search\" type=\"application/opensearchdescription+xml\" href=\"/opensearch.xml\" title=\"GitHub\" />\n" +
" <link rel=\"fluid-icon\" href=\"https://github.com/fluidicon.png\" title=\"GitHub\" />\n" + " <link rel=\"fluid-icon\" href=\"https://github.com/fluidicon.png\" title=\"GitHub\" />\n" +
" <link rel=\"apple-touch-icon\" sizes=\"57x57\" href=\"/apple-touch-icon-114.png\" />\n" + " <link rel=\"apple-touch-icon\" sizes=\"57x57\" href=\"/apple-touch-icon-114.png\" />\n" +
@ -27,7 +27,7 @@ public class MockDownloader implements Downloader{
" <link rel=\"apple-touch-icon\" sizes=\"144x144\" href=\"/apple-touch-icon-144.png\" />\n" + " <link rel=\"apple-touch-icon\" sizes=\"144x144\" href=\"/apple-touch-icon-144.png\" />\n" +
" <link rel=\"logo\" type=\"image/svg\" href=\"https://github-media-downloads.s3.amazonaws.com/github-logo.svg\" />\n" + " <link rel=\"logo\" type=\"image/svg\" href=\"https://github-media-downloads.s3.amazonaws.com/github-logo.svg\" />\n" +
" <meta property=\"og:image\" content=\"https://github.global.ssl.fastly.net/images/modules/logos_page/Octocat.png\">\n" + " <meta property=\"og:image\" content=\"https://github.global.ssl.fastly.net/images/modules/logos_page/Octocat.png\">\n" +
" <meta name=\"hostname\" content=\"github-fe120-cp1-prd.iad.github.net\">\n" + " <meta name=\"hostname\" content=\"github-fe114-cp1-prd.iad.github.net\">\n" +
" <meta name=\"ruby\" content=\"ruby 1.9.3p194-tcs-github-tcmalloc (2012-05-25, TCS patched 2012-05-27, GitHub v1.0.36) [x86_64-linux]\">\n" + " <meta name=\"ruby\" content=\"ruby 1.9.3p194-tcs-github-tcmalloc (2012-05-25, TCS patched 2012-05-27, GitHub v1.0.36) [x86_64-linux]\">\n" +
" <link rel=\"assets\" href=\"https://github.global.ssl.fastly.net/\">\n" + " <link rel=\"assets\" href=\"https://github.global.ssl.fastly.net/\">\n" +
" <link rel=\"xhr-socket\" href=\"/_sockets\" />\n" + " <link rel=\"xhr-socket\" href=\"/_sockets\" />\n" +
@ -38,7 +38,7 @@ public class MockDownloader implements Downloader{
" <meta name=\"msapplication-TileImage\" content=\"/windows-tile.png\" />\n" + " <meta name=\"msapplication-TileImage\" content=\"/windows-tile.png\" />\n" +
" <meta name=\"msapplication-TileColor\" content=\"#ffffff\" />\n" + " <meta name=\"msapplication-TileColor\" content=\"#ffffff\" />\n" +
" <meta name=\"selected-link\" value=\"repo_source\" data-pjax-transient />\n" + " <meta name=\"selected-link\" value=\"repo_source\" data-pjax-transient />\n" +
" <meta content=\"collector.githubapp.com\" name=\"octolytics-host\" /><meta content=\"github\" name=\"octolytics-app-id\" /><meta content=\"d70ff776-e041-43ec-9e11-6fff09ae6117\" name=\"octolytics-dimension-request_id\" /><meta content=\"1351884\" name=\"octolytics-actor-id\" /><meta content=\"code4craft\" name=\"octolytics-actor-login\" /><meta content=\"6ba594fdd7b6075190d470f5284075cfe97dcb1f80883d29c3d79d927e87ac85\" name=\"octolytics-actor-hash\" />\n" + " <meta content=\"collector.githubapp.com\" name=\"octolytics-host\" /><meta content=\"github\" name=\"octolytics-app-id\" /><meta content=\"D2167A02:4E87:89497A:523FCC67\" name=\"octolytics-dimension-request_id\" />\n" +
" \n" + " \n" +
"\n" + "\n" +
" \n" + " \n" +
@ -48,15 +48,16 @@ public class MockDownloader implements Downloader{
" <meta content=\"authenticity_token\" name=\"csrf-param\" />\n" + " <meta content=\"authenticity_token\" name=\"csrf-param\" />\n" +
"<meta content=\"i4/tXwrpqoMtPPKJTN4eSSPnFfrSzZkuIkeP//SUW34=\" name=\"csrf-token\" />\n" + "<meta content=\"i4/tXwrpqoMtPPKJTN4eSSPnFfrSzZkuIkeP//SUW34=\" name=\"csrf-token\" />\n" +
"\n" + "\n" +
" <link href=\"https://github.global.ssl.fastly.net/assets/github-8d13b140cf7e2873c4dd1e0f589136f0e71bd381.css\" media=\"all\" rel=\"stylesheet\" type=\"text/css\" />\n" + " <link href=\"https://github.global.ssl.fastly.net/assets/github-4d622651f87d0cfd8c33f1c020455121d2af0be0.css\" media=\"all\" rel=\"stylesheet\" type=\"text/css\" />\n" +
" <link href=\"https://github.global.ssl.fastly.net/assets/github2-d75c750a6b14571dc070b6570d9224acd7b6795e.css\" media=\"all\" rel=\"stylesheet\" type=\"text/css\" />\n" + " <link href=\"https://github.global.ssl.fastly.net/assets/github2-2c867c2081830b4a942703b9d3d565bf90f6046d.css\" media=\"all\" rel=\"stylesheet\" type=\"text/css\" />\n" +
" \n" + " \n" +
"\n" + "\n" +
" \n" +
"\n" + "\n" +
" <script src=\"https://github.global.ssl.fastly.net/assets/frameworks-f86a2975a82dceee28e5afe598d1ebbfd7109d79.js\" type=\"text/javascript\"></script>\n" + " <script src=\"https://github.global.ssl.fastly.net/assets/frameworks-8db79d6d3d61c3bdec72ede901c2b6dbd4a79dad.js\" type=\"text/javascript\"></script>\n" +
" <script src=\"https://github.global.ssl.fastly.net/assets/github-5289a6d6f7dbb5c517007827e10db51fd3ea0251.js\" type=\"text/javascript\"></script>\n" + " <script src=\"https://github.global.ssl.fastly.net/assets/github-0053cb56d6961482e50d72f8e19dc915009ce6b7.js\" type=\"text/javascript\"></script>\n" +
" \n" + " \n" +
" <meta http-equiv=\"x-pjax-version\" content=\"119d1d5ab0189c49025edd294a6b79f2\">\n" + " <meta http-equiv=\"x-pjax-version\" content=\"b5479068af2118811ca4dcd8c0c29e66\">\n" +
"\n" + "\n" +
" <meta property=\"og:title\" content=\"webmagic\"/>\n" + " <meta property=\"og:title\" content=\"webmagic\"/>\n" +
" <meta property=\"og:type\" content=\"githubog:gitrepository\"/>\n" + " <meta property=\"og:type\" content=\"githubog:gitrepository\"/>\n" +
@ -73,37 +74,42 @@ public class MockDownloader implements Downloader{
" </head>\n" + " </head>\n" +
"\n" + "\n" +
"\n" + "\n" +
" <body class=\"logged_in env-production macintosh vis-public\">\n" + " <body class=\"logged_out env-production macintosh vis-public\">\n" +
" <div class=\"wrapper\">\n" + " <div class=\"wrapper\">\n" +
" \n" + " \n" +
" \n" + " \n" +
" \n" + " \n" +
"\n" + "\n" +
"\n" + "\n" +
" <div class=\"header header-logged-in true\">\n" + " \n" +
" <div class=\"header header-logged-out\">\n" +
" <div class=\"container clearfix\">\n" + " <div class=\"container clearfix\">\n" +
"\n" + "\n" +
" <a class=\"header-logo-invertocat\" href=\"https://github.com/\">\n" + " <a class=\"header-logo-wordmark\" href=\"https://github.com/\">\n" +
" <span class=\"mega-octicon octicon-mark-github\"></span>\n" + " <span class=\"mega-octicon octicon-logo-github\"></span>\n" +
"</a>\n" + " </a>\n" +
"\n" +
" <div class=\"divider-vertical\"></div>\n" +
"\n" + "\n" +
" \n" + " <div class=\"header-actions\">\n" +
" <a href=\"/notifications\" class=\"notification-indicator tooltipped downwards\" data-gotokey=\"n\" title=\"You have no unread notifications\">\n" + " <a class=\"button primary\" href=\"/signup\">Sign up</a>\n" +
" <span class=\"mail-status all-read\"></span>\n" + " <a class=\"button signin\" href=\"/login?return_to=%2Fcode4craft%2Fwebmagic\">Sign in</a>\n" +
"</a> <div class=\"divider-vertical\"></div>\n" + " </div>\n" +
"\n" + "\n" +
" <div class=\"command-bar js-command-bar in-repository\">\n" +
"\n" + "\n" +
" <div class=\"command-bar js-command-bar in-repository\">\n" + " <ul class=\"top-nav\">\n" +
" <form accept-charset=\"UTF-8\" action=\"/search\" class=\"command-bar-form\" id=\"top_search_form\" method=\"get\">\n" + " <li class=\"explore\"><a href=\"/explore\">Explore</a></li>\n" +
" <li class=\"features\"><a href=\"/features\">Features</a></li>\n" +
" <li class=\"enterprise\"><a href=\"https://enterprise.github.com/\">Enterprise</a></li>\n" +
" <li class=\"blog\"><a href=\"/blog\">Blog</a></li>\n" +
" </ul>\n" +
" <form accept-charset=\"UTF-8\" action=\"/search\" class=\"command-bar-form\" id=\"top_search_form\" method=\"get\">\n" +
"\n" + "\n" +
"<input type=\"text\" data-hotkey=\"/ s\" name=\"q\" id=\"js-command-bar-field\" placeholder=\"Search or type a command\" tabindex=\"1\" autocapitalize=\"off\"\n" + "<input type=\"text\" data-hotkey=\"/ s\" name=\"q\" id=\"js-command-bar-field\" placeholder=\"Search or type a command\" tabindex=\"1\" autocapitalize=\"off\"\n" +
" \n" + " \n" +
" data-username=\"code4craft\"\n" + " \n" +
" data-repo=\"code4craft/webmagic\"\n" + " data-repo=\"code4craft/webmagic\"\n" +
" data-branch=\"master\"\n" + " data-branch=\"master\"\n" +
" data-sha=\"e4a0a442b4476c547e95db5cdaa06e2274cac38f\"\n" + " data-sha=\"c5ed5916d20b96963d906dde8bccc3627568e486\"\n" +
" >\n" + " >\n" +
"\n" + "\n" +
" <input type=\"hidden\" name=\"nwo\" value=\"code4craft/webmagic\" />\n" + " <input type=\"hidden\" name=\"nwo\" value=\"code4craft/webmagic\" />\n" +
@ -140,84 +146,15 @@ public class MockDownloader implements Downloader{
" <input type=\"hidden\" name=\"ref\" value=\"cmdform\">\n" + " <input type=\"hidden\" name=\"ref\" value=\"cmdform\">\n" +
"\n" + "\n" +
"</form>\n" + "</form>\n" +
" <ul class=\"top-nav\">\n" + " </div>\n" +
" <li class=\"explore\"><a href=\"/explore\">Explore</a></li>\n" +
" <li><a href=\"https://gist.github.com\">Gist</a></li>\n" +
" <li><a href=\"/blog\">Blog</a></li>\n" +
" <li><a href=\"https://help.github.com\">Help</a></li>\n" +
" </ul>\n" +
" </div>\n" +
"\n" +
" \n" +
"\n" +
"\n" +
" <ul id=\"user-links\">\n" +
" <li>\n" +
" <a href=\"/code4craft\" class=\"name\">\n" +
" <img height=\"20\" src=\"https://2.gravatar.com/avatar/4ce9123a05ae222d71d2857316cbe699?d=https%3A%2F%2Fidenticons.github.com%2F19ef9dc10e8399f81a8944a399812d77.png&amp;s=140\" width=\"20\" /> code4craft\n" +
" </a>\n" +
" </li>\n" +
"\n" +
" <li>\n" +
" <a href=\"/new\" id=\"new_repo\" class=\"tooltipped downwards\" title=\"Create a new repo\" aria-label=\"Create a new repo\">\n" +
" <span class=\"octicon octicon-repo-create\"></span>\n" +
" </a>\n" +
" </li>\n" +
"\n" +
" <li>\n" +
" <a href=\"/settings/profile\" id=\"account_settings\"\n" +
" class=\"tooltipped downwards\"\n" +
" aria-label=\"Account settings \"\n" +
" title=\"Account settings \">\n" +
" <span class=\"octicon octicon-tools\"></span>\n" +
" </a>\n" +
" </li>\n" +
" <li>\n" +
" <a class=\"tooltipped downwards\" href=\"/logout\" data-method=\"post\" id=\"logout\" title=\"Sign out\" aria-label=\"Sign out\">\n" +
" <span class=\"octicon octicon-log-out\"></span>\n" +
" </a>\n" +
" </li>\n" +
"\n" +
" </ul>\n" +
"\n" +
"<div class=\"js-new-dropdown-contents hidden\">\n" +
" \n" +
"\n" +
"<ul class=\"dropdown-menu\">\n" +
" <li>\n" +
" <a href=\"/new\"><span class=\"octicon octicon-repo-create\"></span> New repository</a>\n" +
" </li>\n" +
" <li>\n" +
" <a href=\"/organizations/new\"><span class=\"octicon octicon-organization\"></span> New organization</a>\n" +
" </li>\n" +
"\n" +
"\n" +
"\n" +
" <li class=\"section-title\">\n" +
" <span title=\"code4craft/webmagic\">This repository</span>\n" +
" </li>\n" +
" <li>\n" +
" <a href=\"/code4craft/webmagic/issues/new\"><span class=\"octicon octicon-issue-opened\"></span> New issue</a>\n" +
" </li>\n" +
" <li>\n" +
" <a href=\"/code4craft/webmagic/settings/collaboration\"><span class=\"octicon octicon-person-add\"></span> New collaborator</a>\n" +
" </li>\n" +
"</ul>\n" +
"\n" +
"</div>\n" +
"\n" +
"\n" + "\n" +
" \n" +
" </div>\n" + " </div>\n" +
"</div>\n" + "</div>\n" +
"\n" + "\n" +
" \n" +
"\n" + "\n" +
" \n" + " \n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" +
"\n" +
" <div class=\"site\" itemscope itemtype=\"http://schema.org/WebPage\">\n" + " <div class=\"site\" itemscope itemtype=\"http://schema.org/WebPage\">\n" +
" \n" + " \n" +
" <div class=\"pagehead repohead instapaper_ignore readability-menu\">\n" + " <div class=\"pagehead repohead instapaper_ignore readability-menu\">\n" +
@ -226,100 +163,29 @@ public class MockDownloader implements Downloader{
"\n" + "\n" +
"<ul class=\"pagehead-actions\">\n" + "<ul class=\"pagehead-actions\">\n" +
"\n" + "\n" +
" <li class=\"subscription\">\n" +
" <form accept-charset=\"UTF-8\" action=\"/notifications/subscribe\" class=\"js-social-container\" data-autosubmit=\"true\" data-remote=\"true\" method=\"post\"><div style=\"margin:0;padding:0;display:inline\"><input name=\"authenticity_token\" type=\"hidden\" value=\"i4/tXwrpqoMtPPKJTN4eSSPnFfrSzZkuIkeP//SUW34=\" /></div> <input id=\"repository_id\" name=\"repository_id\" type=\"hidden\" value=\"9623064\" />\n" +
"\n" +
" <div class=\"select-menu js-menu-container js-select-menu\">\n" +
" <a class=\"social-count js-social-count\" href=\"/code4craft/webmagic/watchers\">\n" +
" 23\n" +
" </a>\n" +
" <span class=\"minibutton select-menu-button with-count js-menu-target\">\n" +
" <span class=\"js-select-button\">\n" +
" <span class=\"octicon octicon-eye-unwatch\"></span>\n" +
" Unwatch\n" +
" </span>\n" +
" </span>\n" +
"\n" +
" <div class=\"select-menu-modal-holder\">\n" +
" <div class=\"select-menu-modal subscription-menu-modal js-menu-content\">\n" +
" <div class=\"select-menu-header\">\n" +
" <span class=\"select-menu-title\">Notification status</span>\n" +
" <span class=\"octicon octicon-remove-close js-menu-close\"></span>\n" +
" </div> <!-- /.select-menu-header -->\n" +
"\n" +
" <div class=\"select-menu-list js-navigation-container\">\n" +
"\n" +
" <div class=\"select-menu-item js-navigation-item \">\n" +
" <span class=\"select-menu-item-icon octicon octicon-check\"></span>\n" +
" <div class=\"select-menu-item-text\">\n" +
" <input id=\"do_included\" name=\"do\" type=\"radio\" value=\"included\" />\n" +
" <h4>Not watching</h4>\n" +
" <span class=\"description\">You only receive notifications for discussions in which you participate or are @mentioned.</span>\n" +
" <span class=\"js-select-button-text hidden-select-button-text\">\n" +
" <span class=\"octicon octicon-eye-watch\"></span>\n" +
" Watch\n" +
" </span>\n" +
" </div>\n" +
" </div> <!-- /.select-menu-item -->\n" +
"\n" +
" <div class=\"select-menu-item js-navigation-item selected\">\n" +
" <span class=\"select-menu-item-icon octicon octicon octicon-check\"></span>\n" +
" <div class=\"select-menu-item-text\">\n" +
" <input checked=\"checked\" id=\"do_subscribed\" name=\"do\" type=\"radio\" value=\"subscribed\" />\n" +
" <h4>Watching</h4>\n" +
" <span class=\"description\">You receive notifications for all discussions in this repository.</span>\n" +
" <span class=\"js-select-button-text hidden-select-button-text\">\n" +
" <span class=\"octicon octicon-eye-unwatch\"></span>\n" +
" Unwatch\n" +
" </span>\n" +
" </div>\n" +
" </div> <!-- /.select-menu-item -->\n" +
"\n" +
" <div class=\"select-menu-item js-navigation-item \">\n" +
" <span class=\"select-menu-item-icon octicon octicon-check\"></span>\n" +
" <div class=\"select-menu-item-text\">\n" +
" <input id=\"do_ignore\" name=\"do\" type=\"radio\" value=\"ignore\" />\n" +
" <h4>Ignoring</h4>\n" +
" <span class=\"description\">You do not receive any notifications for discussions in this repository.</span>\n" +
" <span class=\"js-select-button-text hidden-select-button-text\">\n" +
" <span class=\"octicon octicon-mute\"></span>\n" +
" Stop ignoring\n" +
" </span>\n" +
" </div>\n" +
" </div> <!-- /.select-menu-item -->\n" +
"\n" +
" </div> <!-- /.select-menu-list -->\n" +
"\n" +
" </div> <!-- /.select-menu-modal -->\n" +
" </div> <!-- /.select-menu-modal-holder -->\n" +
" </div> <!-- /.select-menu -->\n" +
"\n" +
"</form>\n" +
" </li>\n" +
"\n" + "\n" +
" <li>\n" + " <li>\n" +
" \n" + " <a href=\"/login?return_to=%2Fcode4craft%2Fwebmagic\"\n" +
"<div class=\"js-toggler-container js-social-container starring-container \">\n" + " class=\"minibutton with-count js-toggler-target star-button entice tooltipped upwards\"\n" +
" <a href=\"/code4craft/webmagic/unstar\" class=\"minibutton with-count js-toggler-target star-button starred upwards\" title=\"Unstar this repo\" data-remote=\"true\" data-method=\"post\" rel=\"nofollow\">\n" + " title=\"You must be signed in to use this feature\" rel=\"nofollow\">\n" +
" <span class=\"octicon octicon-star-delete\"></span><span class=\"text\">Unstar</span>\n" + " <span class=\"octicon octicon-star\"></span>Star\n" +
" </a>\n" + "</a>\n" +
" <a href=\"/code4craft/webmagic/star\" class=\"minibutton with-count js-toggler-target star-button unstarred upwards\" title=\"Star this repo\" data-remote=\"true\" data-method=\"post\" rel=\"nofollow\">\n" + "<a class=\"social-count js-social-count\" href=\"/code4craft/webmagic/stargazers\">\n" +
" <span class=\"octicon octicon-star\"></span><span class=\"text\">Star</span>\n" + " 86\n" +
" </a>\n" + "</a>\n" +
" <a class=\"social-count js-social-count\" href=\"/code4craft/webmagic/stargazers\">78</a>\n" +
"</div>\n" +
"\n" + "\n" +
" </li>\n" + " </li>\n" +
"\n" + "\n" +
"\n" + " <li>\n" +
" <li>\n" + " <a href=\"/login?return_to=%2Fcode4craft%2Fwebmagic\"\n" +
" <a href=\"/code4craft/webmagic/fork\" class=\"minibutton with-count js-toggler-target fork-button lighter upwards\" title=\"Fork this repo\" rel=\"nofollow\" data-method=\"post\">\n" + " class=\"minibutton with-count js-toggler-target fork-button entice tooltipped upwards\"\n" +
" <span class=\"octicon octicon-git-branch-create\"></span><span class=\"text\">Fork</span>\n" + " title=\"You must be signed in to fork a repository\" rel=\"nofollow\">\n" +
" </a>\n" + " <span class=\"octicon octicon-git-branch\"></span>Fork\n" +
" <a href=\"/code4craft/webmagic/network\" class=\"social-count\">65</a>\n" + " </a>\n" +
" </li>\n" + " <a href=\"/code4craft/webmagic/network\" class=\"social-count\">\n" +
"\n" + " 70\n" +
"\n" + " </a>\n" +
" </li>\n" +
"</ul>\n" + "</ul>\n" +
"\n" + "\n" +
" <h1 itemscope itemtype=\"http://data-vocabulary.org/Breadcrumb\" class=\"entry-title public\">\n" + " <h1 itemscope itemtype=\"http://data-vocabulary.org/Breadcrumb\" class=\"entry-title public\">\n" +
@ -357,7 +223,7 @@ public class MockDownloader implements Downloader{
" <li class=\"tooltipped leftwards\" title=\"Issues\">\n" + " <li class=\"tooltipped leftwards\" title=\"Issues\">\n" +
" <a href=\"/code4craft/webmagic/issues\" aria-label=\"Issues\" class=\"js-selected-navigation-item js-disable-pjax\" data-gotokey=\"i\" data-selected-links=\"repo_issues /code4craft/webmagic/issues\">\n" + " <a href=\"/code4craft/webmagic/issues\" aria-label=\"Issues\" class=\"js-selected-navigation-item js-disable-pjax\" data-gotokey=\"i\" data-selected-links=\"repo_issues /code4craft/webmagic/issues\">\n" +
" <span class=\"octicon octicon-issue-opened\"></span> <span class=\"full-word\">Issues</span>\n" + " <span class=\"octicon octicon-issue-opened\"></span> <span class=\"full-word\">Issues</span>\n" +
" <span class='counter'>7</span>\n" + " <span class='counter'>2</span>\n" +
" <img alt=\"Octocat-spinner-32\" class=\"mini-loader\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"mini-loader\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
"</a> </li>\n" + "</a> </li>\n" +
"\n" + "\n" +
@ -397,15 +263,6 @@ public class MockDownloader implements Downloader{
" </ul>\n" + " </ul>\n" +
"\n" + "\n" +
"\n" + "\n" +
" <div class=\"repo-menu-separator\"></div>\n" +
" <ul class=\"repo-menu\">\n" +
" <li class=\"tooltipped leftwards\" title=\"Settings\">\n" +
" <a href=\"/code4craft/webmagic/settings\" data-pjax aria-label=\"Settings\">\n" +
" <span class=\"octicon octicon-tools\"></span> <span class=\"full-word\">Settings</span>\n" +
" <img alt=\"Octocat-spinner-32\" class=\"mini-loader\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </a>\n" +
" </li>\n" +
" </ul>\n" +
" </div>\n" + " </div>\n" +
"</div>\n" + "</div>\n" +
"\n" + "\n" +
@ -416,9 +273,8 @@ public class MockDownloader implements Downloader{
"\n" + "\n" +
"<div class=\"clone-url open\"\n" + "<div class=\"clone-url open\"\n" +
" data-protocol-type=\"http\"\n" + " data-protocol-type=\"http\"\n" +
" data-url=\"/users/set_protocol?protocol_selector=http&amp;protocol_type=push\">\n" + " data-url=\"/users/set_protocol?protocol_selector=http&amp;protocol_type=clone\">\n" +
" <h3><strong>HTTPS</strong> clone URL</h3>\n" + " <h3><strong>HTTPS</strong> clone URL</h3>\n" +
"\n" +
" <div class=\"clone-url-box\">\n" + " <div class=\"clone-url-box\">\n" +
" <input type=\"text\" class=\"clone js-url-field\"\n" + " <input type=\"text\" class=\"clone js-url-field\"\n" +
" value=\"https://github.com/code4craft/webmagic.git\" readonly=\"readonly\">\n" + " value=\"https://github.com/code4craft/webmagic.git\" readonly=\"readonly\">\n" +
@ -430,25 +286,9 @@ public class MockDownloader implements Downloader{
" \n" + " \n" +
"\n" + "\n" +
"<div class=\"clone-url \"\n" + "<div class=\"clone-url \"\n" +
" data-protocol-type=\"ssh\"\n" +
" data-url=\"/users/set_protocol?protocol_selector=ssh&amp;protocol_type=push\">\n" +
" <h3><strong>SSH</strong> clone URL</h3>\n" +
"\n" +
" <div class=\"clone-url-box\">\n" +
" <input type=\"text\" class=\"clone js-url-field\"\n" +
" value=\"git@github.com:code4craft/webmagic.git\" readonly=\"readonly\">\n" +
"\n" +
" <span class=\"js-zeroclipboard url-box-clippy minibutton zeroclipboard-button\" data-clipboard-text=\"git@github.com:code4craft/webmagic.git\" data-copied-hint=\"copied!\" title=\"copy to clipboard\"><span class=\"octicon octicon-clippy\"></span></span>\n" +
" </div>\n" +
"</div>\n" +
"\n" +
" \n" +
"\n" +
"<div class=\"clone-url \"\n" +
" data-protocol-type=\"subversion\"\n" + " data-protocol-type=\"subversion\"\n" +
" data-url=\"/users/set_protocol?protocol_selector=subversion&amp;protocol_type=push\">\n" + " data-url=\"/users/set_protocol?protocol_selector=subversion&amp;protocol_type=clone\">\n" +
" <h3><strong>Subversion</strong> checkout URL</h3>\n" + " <h3><strong>Subversion</strong> checkout URL</h3>\n" +
"\n" +
" <div class=\"clone-url-box\">\n" + " <div class=\"clone-url-box\">\n" +
" <input type=\"text\" class=\"clone js-url-field\"\n" + " <input type=\"text\" class=\"clone js-url-field\"\n" +
" value=\"https://github.com/code4craft/webmagic\" readonly=\"readonly\">\n" + " value=\"https://github.com/code4craft/webmagic\" readonly=\"readonly\">\n" +
@ -458,15 +298,17 @@ public class MockDownloader implements Downloader{
"</div>\n" + "</div>\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" +
"<p class=\"clone-options\">You can clone with\n" + "<p class=\"clone-options\">You can clone with\n" +
" <a href=\"#\" class=\"js-clone-selector\" data-protocol=\"http\">HTTPS</a>,\n" + " <a href=\"#\" class=\"js-clone-selector\" data-protocol=\"http\">HTTPS</a>,\n" +
" <a href=\"#\" class=\"js-clone-selector\" data-protocol=\"ssh\">SSH</a>,\n" + " or <a href=\"#\" class=\"js-clone-selector\" data-protocol=\"subversion\">Subversion</a>.\n" +
" <a href=\"#\" class=\"js-clone-selector\" data-protocol=\"subversion\">Subversion</a>,\n" + " <span class=\"octicon help tooltipped upwards\" title=\"Get help on which URL is right for you.\">\n" +
" and <a href=\"https://help.github.com/articles/which-remote-url-should-i-use\">other methods.</a>\n" + " <a href=\"https://help.github.com/articles/which-remote-url-should-i-use\">\n" +
" <span class=\"octicon octicon-question\"></span>\n" +
" </a>\n" +
" </span>\n" +
"</p>\n" + "</p>\n" +
"\n" + "\n" +
" <a href=\"github-mac://openRepo/https://github.com/code4craft/webmagic\" class=\"minibutton sidebar-button\">\n" + " <a href=\"http://mac.github.com\" class=\"minibutton sidebar-button\">\n" +
" <span class=\"octicon octicon-device-desktop\"></span>\n" + " <span class=\"octicon octicon-device-desktop\"></span>\n" +
" Clone in Desktop\n" + " Clone in Desktop\n" +
" </a>\n" + " </a>\n" +
@ -492,22 +334,7 @@ public class MockDownloader implements Downloader{
" </div>\n" + " </div>\n" +
"\n" + "\n" +
"\n" + "\n" +
" <span class=\"edit-link js-details-show js-details-target\">— <a href=\"#\">Edit</a></span>\n" +
" <form accept-charset=\"UTF-8\" action=\"/code4craft/webmagic/settings/update_meta\" class=\"edit-repository-meta js-details-edit\" method=\"post\"><div style=\"margin:0;padding:0;display:inline\"><input name=\"_method\" type=\"hidden\" value=\"put\" /><input name=\"authenticity_token\" type=\"hidden\" value=\"i4/tXwrpqoMtPPKJTN4eSSPnFfrSzZkuIkeP//SUW34=\" /></div>\n" +
"\n" +
" <div class=\"field description-field\">\n" +
" <label for=\"repo_description\">Description</label>\n" +
" <input type=\"text\" name=\"repo_description\" value=\"A scalable web crawler framework.\" placeholder=\"Short description of this repository\" />\n" +
" </div>\n" +
"\n" +
" <div class=\"field website-field\" >\n" +
" <label for=\"repo_homepage\">Website</label>\n" +
" <input type=\"text\" name=\"repo_homepage\" value=\"\" placeholder=\"Website for this repository (optional)\" />\n" +
" </div>\n" +
"\n" + "\n" +
" <button class=\"button classy\">Save</button>\n" +
" <span class=\"cancel\">or <a href=\"#\" class=\"js-details-target\">cancel</a></a>\n" +
"</form>\n" +
"</div>\n" + "</div>\n" +
"\n" + "\n" +
"<div class=\"capped-box overall-summary \">\n" + "<div class=\"capped-box overall-summary \">\n" +
@ -519,7 +346,7 @@ public class MockDownloader implements Downloader{
" <a data-pjax href=\"/code4craft/webmagic/commits/master\">\n" + " <a data-pjax href=\"/code4craft/webmagic/commits/master\">\n" +
" <span class=\"num\">\n" + " <span class=\"num\">\n" +
" <span class=\"octicon octicon-history\"></span>\n" + " <span class=\"octicon octicon-history\"></span>\n" +
" 299\n" + " 311\n" +
" </span>\n" + " </span>\n" +
" commits\n" + " commits\n" +
" </a>\n" + " </a>\n" +
@ -538,7 +365,7 @@ public class MockDownloader implements Downloader{
" <a data-pjax href=\"/code4craft/webmagic/releases\">\n" + " <a data-pjax href=\"/code4craft/webmagic/releases\">\n" +
" <span class=\"num\">\n" + " <span class=\"num\">\n" +
" <span class=\"octicon octicon-tag\"></span>\n" + " <span class=\"octicon octicon-tag\"></span>\n" +
" 4\n" + " 5\n" +
" </span>\n" + " </span>\n" +
" releases\n" + " releases\n" +
" </a>\n" + " </a>\n" +
@ -561,7 +388,7 @@ public class MockDownloader implements Downloader{
" <a href=\"/code4craft/webmagic/search?l=java\">\n" + " <a href=\"/code4craft/webmagic/search?l=java\">\n" +
" <span class=\"color-block language-color\" style=\"background-color:#b07219;\"></span>\n" + " <span class=\"color-block language-color\" style=\"background-color:#b07219;\"></span>\n" +
" <span class=\"lang\">Java</span>\n" + " <span class=\"lang\">Java</span>\n" +
" <span class=\"percent\">100.0%</span>\n" + " <span class=\"percent\">100%</span>\n" +
" </a>\n" + " </a>\n" +
" </li>\n" + " </li>\n" +
" </ol>\n" + " </ol>\n" +
@ -574,14 +401,10 @@ public class MockDownloader implements Downloader{
" class=\"repository-lang-stats-graph js-toggle-lang-stats tooltipped downwards\"\n" + " class=\"repository-lang-stats-graph js-toggle-lang-stats tooltipped downwards\"\n" +
" title=\"Show language statistics\"\n" + " title=\"Show language statistics\"\n" +
" style=\"background-color:#b07219\">\n" + " style=\"background-color:#b07219\">\n" +
" <span class=\"language-color\" style=\"width:100.0%; background-color:#b07219;\" itemprop=\"keywords\">Java</span>\n" + " <span class=\"language-color\" style=\"width:100%; background-color:#b07219;\" itemprop=\"keywords\">Java</span>\n" +
" </a>\n" + " </a>\n" +
"\n" + "\n" +
"\n" + "\n" +
" <div\n" +
" >\n" +
" </div>\n" +
"\n" +
"\n" + "\n" +
"\n" + "\n" +
"<div class=\"file-navigation in-mid-page\">\n" + "<div class=\"file-navigation in-mid-page\">\n" +
@ -595,7 +418,8 @@ public class MockDownloader implements Downloader{
"<div class=\"select-menu js-menu-container js-select-menu\" >\n" + "<div class=\"select-menu js-menu-container js-select-menu\" >\n" +
" <span class=\"minibutton select-menu-button js-menu-target\" data-hotkey=\"w\"\n" + " <span class=\"minibutton select-menu-button js-menu-target\" data-hotkey=\"w\"\n" +
" data-master-branch=\"master\"\n" + " data-master-branch=\"master\"\n" +
" data-ref=\"master\" role=\"button\" aria-label=\"Switch branches or tags\">\n" + " data-ref=\"master\"\n" +
" role=\"button\" aria-label=\"Switch branches or tags\" tabindex=\"0\">\n" +
" <span class=\"octicon octicon-git-branch\"></span>\n" + " <span class=\"octicon octicon-git-branch\"></span>\n" +
" <i>branch:</i>\n" + " <i>branch:</i>\n" +
" <span class=\"js-select-button\">master</span>\n" + " <span class=\"js-select-button\">master</span>\n" +
@ -611,7 +435,7 @@ public class MockDownloader implements Downloader{
"\n" + "\n" +
" <div class=\"select-menu-filters\">\n" + " <div class=\"select-menu-filters\">\n" +
" <div class=\"select-menu-text-filter\">\n" + " <div class=\"select-menu-text-filter\">\n" +
" <input type=\"text\" aria-label=\"Find or create a branch…\" id=\"context-commitish-filter-field\" class=\"js-filterable-field js-navigation-enable\" placeholder=\"Find or create a branch…\">\n" + " <input type=\"text\" aria-label=\"Filter branches/tags\" id=\"context-commitish-filter-field\" class=\"js-filterable-field js-navigation-enable\" placeholder=\"Filter branches/tags\">\n" +
" </div>\n" + " </div>\n" +
" <div class=\"select-menu-tabs\">\n" + " <div class=\"select-menu-tabs\">\n" +
" <ul>\n" + " <ul>\n" +
@ -648,17 +472,7 @@ public class MockDownloader implements Downloader{
" </div> <!-- /.select-menu-item -->\n" + " </div> <!-- /.select-menu-item -->\n" +
" </div>\n" + " </div>\n" +
"\n" + "\n" +
" <form accept-charset=\"UTF-8\" action=\"/code4craft/webmagic/branches\" class=\"js-create-branch select-menu-item select-menu-new-item-form js-navigation-item js-new-item-form\" method=\"post\"><div style=\"margin:0;padding:0;display:inline\"><input name=\"authenticity_token\" type=\"hidden\" value=\"i4/tXwrpqoMtPPKJTN4eSSPnFfrSzZkuIkeP//SUW34=\" /></div>\n" + " <div class=\"select-menu-no-results\">Nothing to show</div>\n" +
" <span class=\"octicon octicon-git-branch-create select-menu-item-icon\"></span>\n" +
" <div class=\"select-menu-item-text\">\n" +
" <h4>Create branch: <span class=\"js-new-item-name\"></span></h4>\n" +
" <span class=\"description\">from master</span>\n" +
" </div>\n" +
" <input type=\"hidden\" name=\"name\" id=\"name\" class=\"js-new-item-value\">\n" +
" <input type=\"hidden\" name=\"branch\" id=\"branch\" value=\"master\" />\n" +
" <input type=\"hidden\" name=\"path\" id=\"branch\" value=\"\" />\n" +
" </form> <!-- /.select-menu-item -->\n" +
"\n" +
" </div> <!-- /.select-menu-list -->\n" + " </div> <!-- /.select-menu-list -->\n" +
"\n" + "\n" +
" <div class=\"select-menu-list select-menu-tab-bucket js-select-menu-tab-bucket\" data-tab-filter=\"tags\">\n" + " <div class=\"select-menu-list select-menu-tab-bucket js-select-menu-tab-bucket\" data-tab-filter=\"tags\">\n" +
@ -667,6 +481,10 @@ public class MockDownloader implements Downloader{
"\n" + "\n" +
" <div class=\"select-menu-item js-navigation-item \">\n" + " <div class=\"select-menu-item js-navigation-item \">\n" +
" <span class=\"select-menu-item-icon octicon octicon-check\"></span>\n" + " <span class=\"select-menu-item-icon octicon octicon-check\"></span>\n" +
" <a href=\"/code4craft/webmagic/tree/webmagic-parent-0.3.1\" class=\"js-navigation-open select-menu-item-text js-select-button-text css-truncate-target\" data-name=\"webmagic-parent-0.3.1\" data-skip-pjax=\"true\" rel=\"nofollow\" title=\"webmagic-parent-0.3.1\">webmagic-parent-0.3.1</a>\n" +
" </div> <!-- /.select-menu-item -->\n" +
" <div class=\"select-menu-item js-navigation-item \">\n" +
" <span class=\"select-menu-item-icon octicon octicon-check\"></span>\n" +
" <a href=\"/code4craft/webmagic/tree/webmagic-parent-0.2.1\" class=\"js-navigation-open select-menu-item-text js-select-button-text css-truncate-target\" data-name=\"webmagic-parent-0.2.1\" data-skip-pjax=\"true\" rel=\"nofollow\" title=\"webmagic-parent-0.2.1\">webmagic-parent-0.2.1</a>\n" + " <a href=\"/code4craft/webmagic/tree/webmagic-parent-0.2.1\" class=\"js-navigation-open select-menu-item-text js-select-button-text css-truncate-target\" data-name=\"webmagic-parent-0.2.1\" data-skip-pjax=\"true\" rel=\"nofollow\" title=\"webmagic-parent-0.2.1\">webmagic-parent-0.2.1</a>\n" +
" </div> <!-- /.select-menu-item -->\n" + " </div> <!-- /.select-menu-item -->\n" +
" <div class=\"select-menu-item js-navigation-item \">\n" + " <div class=\"select-menu-item js-navigation-item \">\n" +
@ -691,13 +509,13 @@ public class MockDownloader implements Downloader{
"</div> <!-- /.select-menu -->\n" + "</div> <!-- /.select-menu -->\n" +
"\n" + "\n" +
"\n" + "\n" +
" <div class=\"breadcrumb\"><span class='repo-root js-repo-root'><span itemscope=\"\" itemtype=\"http://data-vocabulary.org/Breadcrumb\"><a href=\"/code4craft/webmagic\" data-branch=\"master\" data-direction=\"back\" data-pjax=\"true\" itemscope=\"url\"><span itemprop=\"title\">webmagic</span></a></span></span><span class=\"separator\"> / </span><form action=\"/code4craft/webmagic/new/master\" class=\"js-new-blob-form tooltipped rightwards new-file-link\" method=\"post\" title=\"Create a new file here\"><span aria-label=\"Create a new file here\" class=\"js-new-blob-submit octicon octicon-file-add\" data-test-id=\"create-new-git-file\" role=\"button\"></span></form></div>\n" + " <div class=\"breadcrumb\"><span class='repo-root js-repo-root'><span itemscope=\"\" itemtype=\"http://data-vocabulary.org/Breadcrumb\"><a href=\"/code4craft/webmagic\" data-branch=\"master\" data-direction=\"back\" data-pjax=\"true\" itemscope=\"url\"><span itemprop=\"title\">webmagic</span></a></span></span><span class=\"separator\"> / </span><form action=\"/login?return_to=%2Fcode4craft%2Fwebmagic\" class=\"js-new-blob-form tooltipped rightwards new-file-link\" method=\"post\" title=\"Sign in to make or propose changes\"><span aria-label=\"Sign in to make or propose changes\" class=\"js-new-blob-submit octicon octicon-file-add\" data-test-id=\"create-new-git-file\" role=\"button\"></span></form></div>\n" +
"</div>\n" + "</div>\n" +
"\n" + "\n" +
"\n" + "\n" +
"\n" + "\n" +
"<a href=\"/code4craft/webmagic/find/master\"\n" + "<a href=\"/code4craft/webmagic/find/master\"\n" +
" data-hotkey=\"t\" style=\"display:none\" data-pjax>Show File Finder</a>\n" + " data-hotkey=\"t\" class=\"js-show-file-finder\" style=\"display:none\" data-pjax>Show File Finder</a>\n" +
"<div class=\"bubble files-bubble\">\n" + "<div class=\"bubble files-bubble\">\n" +
" <table class=\"files\" data-pjax>\n" + " <table class=\"files\" data-pjax>\n" +
" <thead>\n" + " <thead>\n" +
@ -735,12 +553,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-core\" class=\"js-directory-link\" id=\"39809e13bc65c3873f79570b81852d62-947dff73c2eda51ae629fa42d6ace984fa044db6\" title=\"webmagic-core\">webmagic-core</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-core\" class=\"js-directory-link\" id=\"39809e13bc65c3873f79570b81852d62-e96da9edd9329cf8448fed332294dd4575549495\" title=\"webmagic-core\">webmagic-core</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/c17a31a21d342ddc4349417557bc8b63aba0ba07\" class=\"message\" data-pjax=\"true\" title=\"fix null pointe exception #26\">fix null pointe exception</a> <a href=\"https://github.com/code4craft/webmagic/issues/26\" class=\"issue-link\" title=\"Annotation extactor does not work\">#26</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/b131878123cb90f6123255bbd21e71bc70a480b7\" class=\"message\" data-pjax=\"true\" title=\"add example\">add example</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-08T06:09:49-07:00\" title=\"2013-09-08 06:09:49\">September 08, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-22T22:01:28-07:00\" title=\"2013-09-22 22:01:28\">September 22, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"alt\">\n" + " <tr class=\"alt\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -748,12 +566,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-extension\" class=\"js-directory-link\" id=\"dc82c79bcb262e1942088502bb426876-5dd5a5a2f7e9aa32848ac323e26fb29e35117bce\" title=\"webmagic-extension\">webmagic-extension</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-extension\" class=\"js-directory-link\" id=\"dc82c79bcb262e1942088502bb426876-6f4453065d5b11429731e2a3e71e10f944da2180\" title=\"webmagic-extension\">webmagic-extension</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/c17a31a21d342ddc4349417557bc8b63aba0ba07\" class=\"message\" data-pjax=\"true\" title=\"fix null pointe exception #26\">fix null pointe exception</a> <a href=\"https://github.com/code4craft/webmagic/issues/26\" class=\"issue-link\" title=\"Annotation extactor does not work\">#26</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/b131878123cb90f6123255bbd21e71bc70a480b7\" class=\"message\" data-pjax=\"true\" title=\"add example\">add example</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-08T06:09:49-07:00\" title=\"2013-09-08 06:09:49\">September 08, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-22T22:01:28-07:00\" title=\"2013-09-22 22:01:28\">September 22, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"\">\n" + " <tr class=\"\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -774,12 +592,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-samples\" class=\"js-directory-link\" id=\"4284b70d4c5e11003fb292b0d0f7539f-3567f90bdc95fbfe3f18913c7c22c9cce3fe6798\" title=\"webmagic-samples\">webmagic-samples</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/webmagic-samples\" class=\"js-directory-link\" id=\"4284b70d4c5e11003fb292b0d0f7539f-55f538835cd8b15fb4e34c8a0d6491dc9559e610\" title=\"webmagic-samples\">webmagic-samples</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/e1b6b54097a6657cfe1c43bb99ba8b47518c455f\" class=\"message\" data-pjax=\"true\" title=\"update version for samples\">update version for samples</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/95ab4edec3daca3353395909a13085079ff8606b\" class=\"message\" data-pjax=\"true\" title=\"some bugfix\">some bugfix</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-03T20:07:28-07:00\" title=\"2013-09-03 20:07:28\">September 03, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-22T17:38:54-07:00\" title=\"2013-09-22 17:38:54\">September 22, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"\">\n" + " <tr class=\"\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -813,12 +631,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/zh_docs\" class=\"js-directory-link\" id=\"bec3b859688b0bbdb94899b1a5b56441-66254ea2ec85e8cf79182bcfe540b699e7e4d206\" title=\"zh_docs\">zh_docs</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/tree/master/zh_docs\" class=\"js-directory-link\" id=\"bec3b859688b0bbdb94899b1a5b56441-2cf0c7c178e3e0280b023f54e3ef21e9b7b9e3b3\" title=\"zh_docs\">zh_docs</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/ac4cd391707da1190744a3891af7c62424fd8d37\" class=\"message\" data-pjax=\"true\" title=\"update version\">update version</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/81f75347573f70a39a83afd5d2f7d626b3b305bd\" class=\"message\" data-pjax=\"true\" title=\"update version\">update version</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-04T05:37:42-07:00\" title=\"2013-09-04 05:37:42\">September 04, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-12T06:28:42-07:00\" title=\"2013-09-12 06:28:42\">September 12, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"alt\">\n" + " <tr class=\"alt\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -852,12 +670,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/blob/master/README.md\" class=\"js-directory-link\" id=\"04c6e90faac2675aa89e2176d2eec7d8-5624019f9b5112a3b9d061551c82bf610fbaad7a\" title=\"README.md\">README.md</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/blob/master/README.md\" class=\"js-directory-link\" id=\"04c6e90faac2675aa89e2176d2eec7d8-01a868db17802ce7915cc2bcfad10244ef4de064\" title=\"README.md\">README.md</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/ac4cd391707da1190744a3891af7c62424fd8d37\" class=\"message\" data-pjax=\"true\" title=\"update version\">update version</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/a0d64b76357a449386755b9867163c91d04a2426\" class=\"message\" data-pjax=\"true\" title=\"update version\">update version</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-04T05:37:42-07:00\" title=\"2013-09-04 05:37:42\">September 04, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-12T06:06:05-07:00\" title=\"2013-09-12 06:06:05\">September 12, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"\">\n" + " <tr class=\"\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -865,12 +683,12 @@ public class MockDownloader implements Downloader{
" <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" + " <img alt=\"Octocat-spinner-32\" class=\"spinner\" height=\"16\" src=\"https://github.global.ssl.fastly.net/images/spinners/octocat-spinner-32.gif\" width=\"16\" />\n" +
" </td>\n" + " </td>\n" +
" <td class=\"content\">\n" + " <td class=\"content\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/blob/master/pom.xml\" class=\"js-directory-link\" id=\"600376dffeb79835ede4a0b285078036-4fdfeee1be6d6430c6e402b036df6c6947f0d4da\" title=\"pom.xml\">pom.xml</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/blob/master/pom.xml\" class=\"js-directory-link\" id=\"600376dffeb79835ede4a0b285078036-e2685a8ad6dbce1421232fced6e46ed3c8c3efa2\" title=\"pom.xml\">pom.xml</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"message\">\n" + " <td class=\"message\">\n" +
" <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/2e8cf0a3dd27503423afe0bc8f3600bcf8ac832b\" class=\"message\" data-pjax=\"true\" title=\"将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。\">将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。</a></span>\n" + " <span class=\"css-truncate css-truncate-target\"><a href=\"/code4craft/webmagic/commit/fb693a4ac41667ba70f2d7c11c73b364fa569e67\" class=\"message\" data-pjax=\"true\" title=\"[maven-release-plugin] prepare for next development iteration\">[maven-release-plugin] prepare for next development iteration</a></span>\n" +
" </td>\n" + " </td>\n" +
" <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-04T09:30:10-07:00\" title=\"2013-09-04 09:30:10\">September 04, 2013</time></span></td>\n" + " <td class=\"age\"><span class=\"css-truncate css-truncate-target\"><time class=\"js-relative-date\" datetime=\"2013-09-08T07:25:07-07:00\" title=\"2013-09-08 07:25:07\">September 08, 2013</time></span></td>\n" +
" </tr>\n" + " </tr>\n" +
" <tr class=\"alt\">\n" + " <tr class=\"alt\">\n" +
" <td class=\"icon\">\n" + " <td class=\"icon\">\n" +
@ -932,12 +750,12 @@ public class MockDownloader implements Downloader{
"<pre><code> &lt;dependency&gt;\n" + "<pre><code> &lt;dependency&gt;\n" +
" &lt;groupId&gt;us.codecraft&lt;/groupId&gt;\n" + " &lt;groupId&gt;us.codecraft&lt;/groupId&gt;\n" +
" &lt;artifactId&gt;webmagic-core&lt;/artifactId&gt;\n" + " &lt;artifactId&gt;webmagic-core&lt;/artifactId&gt;\n" +
" &lt;version&gt;0.3.0&lt;/version&gt;\n" + " &lt;version&gt;0.3.1&lt;/version&gt;\n" +
" &lt;/dependency&gt;\n" + " &lt;/dependency&gt;\n" +
" &lt;dependency&gt;\n" + " &lt;dependency&gt;\n" +
" &lt;groupId&gt;us.codecraft&lt;/groupId&gt;\n" + " &lt;groupId&gt;us.codecraft&lt;/groupId&gt;\n" +
" &lt;artifactId&gt;webmagic-extension&lt;/artifactId&gt;\n" + " &lt;artifactId&gt;webmagic-extension&lt;/artifactId&gt;\n" +
" &lt;version&gt;0.3.0&lt;/version&gt;\n" + " &lt;version&gt;0.3.1&lt;/version&gt;\n" +
" &lt;/dependency&gt;\n" + " &lt;/dependency&gt;\n" +
"</code></pre>\n" + "</code></pre>\n" +
"\n" + "\n" +
@ -949,7 +767,7 @@ public class MockDownloader implements Downloader{
"\n" + "\n" +
"<p>Write a class implements PageProcessor</p>\n" + "<p>Write a class implements PageProcessor</p>\n" +
"\n" + "\n" +
"<div class=\"highlight\"><pre> <span class=\"kd\">public</span> <span class=\"kd\">class</span> <span class=\"nc\">OschinaBlogPageProcesser</span> <span class=\"kd\">implements</span> <span class=\"n\">PageProcessor</span> <span class=\"o\">{</span>\n" + "<div class=\"highlight highlight-java\"><pre> <span class=\"kd\">public</span> <span class=\"kd\">class</span> <span class=\"nc\">OschinaBlogPageProcesser</span> <span class=\"kd\">implements</span> <span class=\"n\">PageProcessor</span> <span class=\"o\">{</span>\n" +
"\n" + "\n" +
" <span class=\"kd\">private</span> <span class=\"n\">Site</span> <span class=\"n\">site</span> <span class=\"o\">=</span> <span class=\"n\">Site</span><span class=\"o\">.</span><span class=\"na\">me</span><span class=\"o\">().</span><span class=\"na\">setDomain</span><span class=\"o\">(</span><span class=\"s\">\"my.oschina.net\"</span><span class=\"o\">)</span>\n" + " <span class=\"kd\">private</span> <span class=\"n\">Site</span> <span class=\"n\">site</span> <span class=\"o\">=</span> <span class=\"n\">Site</span><span class=\"o\">.</span><span class=\"na\">me</span><span class=\"o\">().</span><span class=\"na\">setDomain</span><span class=\"o\">(</span><span class=\"s\">\"my.oschina.net\"</span><span class=\"o\">)</span>\n" +
" <span class=\"o\">.</span><span class=\"na\">addStartUrl</span><span class=\"o\">(</span><span class=\"s\">\"http://my.oschina.net/flashsword/blog\"</span><span class=\"o\">);</span>\n" + " <span class=\"o\">.</span><span class=\"na\">addStartUrl</span><span class=\"o\">(</span><span class=\"s\">\"http://my.oschina.net/flashsword/blog\"</span><span class=\"o\">);</span>\n" +
@ -984,7 +802,7 @@ public class MockDownloader implements Downloader{
"</li>\n" + "</li>\n" +
"</ul><p>You can also use annotation way:</p>\n" + "</ul><p>You can also use annotation way:</p>\n" +
"\n" + "\n" +
"<div class=\"highlight\"><pre> <span class=\"nd\">@TargetUrl</span><span class=\"o\">(</span><span class=\"s\">\"http://my.oschina.net/flashsword/blog/\\\\d+\"</span><span class=\"o\">)</span>\n" + "<div class=\"highlight highlight-java\"><pre> <span class=\"nd\">@TargetUrl</span><span class=\"o\">(</span><span class=\"s\">\"http://my.oschina.net/flashsword/blog/\\\\d+\"</span><span class=\"o\">)</span>\n" +
" <span class=\"kd\">public</span> <span class=\"kd\">class</span> <span class=\"nc\">OschinaBlog</span> <span class=\"o\">{</span>\n" + " <span class=\"kd\">public</span> <span class=\"kd\">class</span> <span class=\"nc\">OschinaBlog</span> <span class=\"o\">{</span>\n" +
"\n" + "\n" +
" <span class=\"nd\">@ExtractBy</span><span class=\"o\">(</span><span class=\"s\">\"//title\"</span><span class=\"o\">)</span>\n" + " <span class=\"nd\">@ExtractBy</span><span class=\"o\">(</span><span class=\"s\">\"//title\"</span><span class=\"o\">)</span>\n" +
@ -1071,7 +889,7 @@ public class MockDownloader implements Downloader{
" </a>\n" + " </a>\n" +
"\n" + "\n" +
" <ul class=\"site-footer-links\">\n" + " <ul class=\"site-footer-links\">\n" +
" <li>&copy; 2013 <span title=\"0.08765s from github-fe120-cp1-prd.iad.github.net\">GitHub</span>, Inc.</li>\n" + " <li>&copy; 2013 <span title=\"0.04752s from github-fe114-cp1-prd.iad.github.net\">GitHub</span>, Inc.</li>\n" +
" <li><a href=\"/site/terms\">Terms</a></li>\n" + " <li><a href=\"/site/terms\">Terms</a></li>\n" +
" <li><a href=\"/site/privacy\">Privacy</a></li>\n" + " <li><a href=\"/site/privacy\">Privacy</a></li>\n" +
" <li><a href=\"/security\">Security</a></li>\n" + " <li><a href=\"/security\">Security</a></li>\n" +
@ -1111,7 +929,6 @@ public class MockDownloader implements Downloader{
" Something went wrong with that request. Please try again.\n" + " Something went wrong with that request. Please try again.\n" +
" </div>\n" + " </div>\n" +
"\n" + "\n" +
" \n" +
" </body>\n" + " </body>\n" +
"</html>\n" + "</html>\n" +
"\n"; "\n";

@ -0,0 +1,20 @@
package us.codecraft.webmagic.formatter;
import org.junit.Test;
import us.codecraft.webmagic.model.formatter.DateFormatter;
import java.util.Date;
/**
 * Unit test for {@link DateFormatter}.
 *
 * @author code4crafter@gmail.com
 */
public class DateFormatterTest {

    /**
     * Configures a {@link DateFormatter} with a single pattern, parses a sample
     * timestamp and verifies the result by rendering it back with the same
     * pattern.
     *
     * <p>The round-trip comparison is used instead of comparing against a fixed
     * instant so the check does not depend on the time zone of the machine
     * running the test (assuming the formatter parses in the JVM default zone,
     * which is what the rendering side uses as well).
     *
     * @throws Exception if the formatter rejects the input
     */
    @Test
    public void testDateFormatter() throws Exception {
        String pattern = "yyyy-MM-dd HH:mm";
        String raw = "2013-09-10 22:11";

        DateFormatter dateFormatter = new DateFormatter();
        dateFormatter.initParam(new String[]{pattern});
        Date parsed = dateFormatter.format(raw);

        // Previously the result was only printed, so the test could never fail
        // on a wrong parse. Fail explicitly instead.
        if (parsed == null) {
            throw new AssertionError("DateFormatter returned null for input: " + raw);
        }
        String rendered = new java.text.SimpleDateFormat(pattern).format(parsed);
        if (!raw.equals(rendered)) {
            throw new AssertionError("expected <" + raw + "> but was <" + rendered + ">");
        }
    }
}

@ -0,0 +1,26 @@
package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.example.GithubRepo;
/**
 * Exercises the annotation-driven extraction of {@link GithubRepo} through
 * {@link OOSpider}.
 *
 * @author code4crafter@gmail.com <br>
 */
public class GithubRepoTest {

    /**
     * Runs an {@link OOSpider} over the webmagic repository URL with a
     * {@link MockDownloader} plugged in, and checks the star and fork counts of
     * every {@link GithubRepo} handed to the pipeline.
     *
     * <p>NOTE(review): the assertions live inside the pipeline callback, so they
     * only run if the spider actually delivers a model to the pipeline.
     */
    @Test
    public void test() {
        final String repoUrl = "https://github.com/code4craft/webmagic";

        // No delay between requests; presumably nothing is fetched over the
        // network because the downloader is replaced below.
        Site site = Site.me().addStartUrl(repoUrl).setSleepTime(0);

        PageModelPipeline<GithubRepo> checkingPipeline = new PageModelPipeline<GithubRepo>() {
            @Override
            public void process(GithubRepo repo, Task task) {
                Assert.assertEquals(86, repo.getStar());
                Assert.assertEquals(70, repo.getFork());
            }
        };

        OOSpider.create(site, checkingPipeline, GithubRepo.class)
                .setDownloader(new MockDownloader())
                .test(repoUrl);
    }
}

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
<!-- Logging setup: everything goes to a single console appender. -->
<!-- Pattern: timestamp, level padded to 5 chars, logger name, (source file:line), "##", message, newline. -->
<appender name="stdout" class="org.apache.log4j.ConsoleAppender">
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- The three loggers below are limited to WARN and above. additivity="false" -->
<!-- stops their events from also reaching the root logger, so each line is printed once. -->
<logger name="org.springframework" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="org.apache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<logger name="net.sf.ehcache" additivity="false">
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<!-- Every other logger inherits from root: DEBUG and above, written to the console. -->
<root>
<level value="debug" />
<appender-ref ref="stdout" />
</root>
</log4j:configuration>

@ -5,7 +5,7 @@
<parent> <parent>
<artifactId>webmagic-parent</artifactId> <artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<version>0.3.1</version> <version>0.3.2</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

@ -1,7 +1,6 @@
package us.codecraft.webmagic.model.samples; package us.codecraft.webmagic.model.samples;
import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.HasKey;
import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.model.annotation.TargetUrl;
@ -13,7 +12,7 @@ import java.util.List;
* @author code4crafter@gmail.com <br> * @author code4crafter@gmail.com <br>
*/ */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements HasKey{ public class OschinaBlog{
@ExtractBy("//title") @ExtractBy("//title")
private String title; private String title;
@ -29,11 +28,6 @@ public class OschinaBlog implements HasKey{
,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); ,new JsonFilePageModelPipeline(), OschinaBlog.class).run();
} }
@Override
public String key() {
return title;
}
public String getTitle() { public String getTitle() {
return title; return title;
} }

@ -34,13 +34,12 @@ webmagic使用maven管理依赖在项目中添加对应的依赖即可使用w
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId> <artifactId>webmagic-core</artifactId>
<version>0.3.0</version> <version>0.3.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>us.codecraft</groupId> <groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId> <artifactId>webmagic-extension</artifactId>
<version>0.3.0 <version>0.3.1</version>
</version>
</dependency> </dependency>
#### 项目结构 #### 项目结构

Loading…
Cancel
Save