update to 0.3.1
@ -1,3 +1,4 @@
@ -1,11 +0,0 @@
#release configuration
#Tue Aug 20 23:36:56 CST 2013
preparationGoals=clean verify
exec.additionalArguments=-Psonatype-oss-release -P development
@ -1,91 +0,0 @@
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.lang.reflect.Constructor;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
* Selector factory with some inner cache.<br>
* @author code4crafter@gmail.com <br>
* @since 0.1.0
public class SelectorFactory {
private Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();
private static final SelectorFactory INSTATNCE = new SelectorFactory();
public static SelectorFactory getInstatnce() {
public RegexSelector newRegexSelector(String regex) {
return newSelector(RegexSelector.class, regex);
public RegexSelector newRegexSelector(String regex, int group) {
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
if (innerCache.get(cacheKey) != null) {
return (RegexSelector) innerCache.get(cacheKey);
return new RegexSelector(regex, group);
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
return newSelector(ReplaceSelector.class, regex, replacement);
public XpathSelector newXpathSelector(String xpath) {
return newSelector(XpathSelector.class, xpath);
public SmartContentSelector newSmartContentSelector() {
return newSelector(SmartContentSelector.class);
public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
String cacheKey = getCacheKey(RegexSelector.class, param);
if (innerCache.get(cacheKey) != null) {
return (T) innerCache.get(cacheKey);
T selector = newSelector(clazz, param);
if (selector != null) {
innerCache.put(cacheKey, selector);
return selector;
public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
try {
if (param.length == 0) {
Constructor<T> constructor
= clazz.getConstructor();
T selector = constructor.newInstance();
return selector;
} else if (param.length == 1) {
Constructor<T> constructor
= clazz.getConstructor(String.class);
T selector = constructor.newInstance(param[0]);
return selector;
} else if (param.length == 2) {
Constructor<T> constructor
= clazz.getConstructor(String.class, String.class);
T selector = constructor.newInstance(param[0], param[1]);
return selector;
} else {
throw new UnsupportedOperationException();
} catch (Exception e) {
throw new IllegalArgumentException("init object error", e);
private String getCacheKey(Class<?> clazz, String... param) {
return clazz.toString() + "_" + StringUtils.join(param, "_");
@ -0,0 +1,28 @@
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
* @author code4crafter@gmail.com
public class SpiderTest {
@Ignore("long time")
public void testStartAndStop() throws InterruptedException {
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
public void process(ResultItems resultItems, Task task) {
// spider.run();
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,14 @@
package us.codecraft.webmagic;
import junit.framework.Assert;
import us.codecraft.webmagic.model.PageModelPipeline;
* @author code4crafter@gmail.com
public class MockPageModelPipeline implements PageModelPipeline{
public void process(Object o, Task task) {
@ -0,0 +1,13 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.pipeline.Pipeline;
* @author code4crafter@gmail.com
public class MockPipeline implements Pipeline{
public void process(ResultItems resultItems, Task task) {
@ -0,0 +1,87 @@
package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
* @author code4crafter@gmail.com <br>
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
public class GithubRepo implements HasKey {
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
private String name;
private String author;
private String readme;
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
private List<String> language;
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
private String star;
private String fork;
private String url;
public void test() {
, new PageModelPipeline<GithubRepo>() {
public void process(GithubRepo o, Task task) {
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
public String key() {
return author + ":" + name;
public String getName() {
return name;
public String getReadme() {
return readme;
public String getAuthor() {
return author;
public List<String> getLanguage() {
return language;
public String getUrl() {
return url;
public String getStar() {
return star;
public String getFork() {
return fork;
@ -0,0 +1,35 @@
package us.codecraft.webmagic.processor;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
* @author code4crafter@gmail.com
public class GithubRepoProcessor implements PageProcessor {
public void process(Page page) {
page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString());
public Site getSite() {
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
public void test() {
OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() {
public void process(ResultItems resultItems, Task task) {
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
@ -1,8 +0,0 @@
touch wordpress.xml
cat wp-head.xml >> wordpress.xml
for f in `ls`;
cat ${f} >> ../wordpress.xml
cat wp-bottom.xml >> wordpress.xml
@ -1,22 +0,0 @@
<guid isPermaLink="false">${id}</guid>
@ -1,2 +0,0 @@
@ -1,35 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="WordPress/3.3.1" created="2012-06-10 09:15" -->
<rss version="2.0"
@ -1,28 +0,0 @@
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
* @author code4crafter@gmail.com <br>
* Date: 13-6-9
* Time: 8:02 AM
public class DiaoyuwengProcessorTest {
public void test() throws IOException {
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
Reference in New Issue