update to 0.3.1
commit
a2fba8caa2
@ -1,3 +1,4 @@
|
||||
target/*
|
||||
*.iml
|
||||
out/
|
||||
.idea
|
||||
|
@ -1,11 +0,0 @@
|
||||
#release configuration
|
||||
#Tue Aug 20 23:36:56 CST 2013
|
||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||
pushChanges=true
|
||||
scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git
|
||||
preparationGoals=clean verify
|
||||
remoteTagging=true
|
||||
scm.commentPrefix=[maven-release-plugin]
|
||||
exec.additionalArguments=-Psonatype-oss-release -P development
|
||||
exec.snapshotReleasePluginAllowed=false
|
||||
completedPhase=check-poms
|
@ -1,91 +0,0 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
/**
|
||||
* Selector factory with some inner cache.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
*/
|
||||
public class SelectorFactory {
|
||||
|
||||
private Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();
|
||||
|
||||
private static final SelectorFactory INSTATNCE = new SelectorFactory();
|
||||
|
||||
public static SelectorFactory getInstatnce() {
|
||||
return INSTATNCE;
|
||||
}
|
||||
|
||||
public RegexSelector newRegexSelector(String regex) {
|
||||
return newSelector(RegexSelector.class, regex);
|
||||
}
|
||||
|
||||
public RegexSelector newRegexSelector(String regex, int group) {
|
||||
String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
|
||||
if (innerCache.get(cacheKey) != null) {
|
||||
return (RegexSelector) innerCache.get(cacheKey);
|
||||
}
|
||||
return new RegexSelector(regex, group);
|
||||
}
|
||||
|
||||
public ReplaceSelector newReplaceSelector(String regex, String replacement) {
|
||||
return newSelector(ReplaceSelector.class, regex, replacement);
|
||||
}
|
||||
|
||||
public XpathSelector newXpathSelector(String xpath) {
|
||||
return newSelector(XpathSelector.class, xpath);
|
||||
}
|
||||
|
||||
public SmartContentSelector newSmartContentSelector() {
|
||||
return newSelector(SmartContentSelector.class);
|
||||
}
|
||||
|
||||
public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
|
||||
String cacheKey = getCacheKey(RegexSelector.class, param);
|
||||
if (innerCache.get(cacheKey) != null) {
|
||||
return (T) innerCache.get(cacheKey);
|
||||
}
|
||||
T selector = newSelector(clazz, param);
|
||||
if (selector != null) {
|
||||
innerCache.put(cacheKey, selector);
|
||||
}
|
||||
return selector;
|
||||
|
||||
}
|
||||
|
||||
public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
|
||||
try {
|
||||
if (param.length == 0) {
|
||||
Constructor<T> constructor
|
||||
= clazz.getConstructor();
|
||||
T selector = constructor.newInstance();
|
||||
return selector;
|
||||
} else if (param.length == 1) {
|
||||
Constructor<T> constructor
|
||||
= clazz.getConstructor(String.class);
|
||||
T selector = constructor.newInstance(param[0]);
|
||||
return selector;
|
||||
} else if (param.length == 2) {
|
||||
Constructor<T> constructor
|
||||
= clazz.getConstructor(String.class, String.class);
|
||||
T selector = constructor.newInstance(param[0], param[1]);
|
||||
return selector;
|
||||
} else {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new IllegalArgumentException("init object error", e);
|
||||
}
|
||||
}
|
||||
|
||||
private String getCacheKey(Class<?> clazz, String... param) {
|
||||
return clazz.toString() + "_" + StringUtils.join(param, "_");
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
import us.codecraft.webmagic.processor.SimplePageProcessor;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class SpiderTest {
|
||||
|
||||
@Ignore("long time")
|
||||
@Test
|
||||
public void testStartAndStop() throws InterruptedException {
|
||||
Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
System.out.println(1);
|
||||
}
|
||||
});
|
||||
spider.start();
|
||||
Thread.sleep(10000);
|
||||
spider.stop();
|
||||
// spider.run();
|
||||
Thread.sleep(10000);
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,14 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import junit.framework.Assert;
|
||||
import us.codecraft.webmagic.model.PageModelPipeline;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class MockPageModelPipeline implements PageModelPipeline{
|
||||
@Override
|
||||
public void process(Object o, Task task) {
|
||||
Assert.assertNotNull(o);
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
package us.codecraft.webmagic;
|
||||
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class MockPipeline implements Pipeline{
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,87 @@
|
||||
package us.codecraft.webmagic.model;
|
||||
|
||||
import junit.framework.Assert;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.MockDownloader;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.Task;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
|
||||
import us.codecraft.webmagic.model.annotation.HelpUrl;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
*/
|
||||
@TargetUrl("https://github.com/\\w+/\\w+")
|
||||
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
|
||||
public class GithubRepo implements HasKey {
|
||||
|
||||
@ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
|
||||
private String name;
|
||||
|
||||
@ExtractByUrl("https://github\\.com/(\\w+)/.*")
|
||||
private String author;
|
||||
|
||||
@ExtractBy("//div[@id='readme']")
|
||||
private String readme;
|
||||
|
||||
@ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
|
||||
private List<String> language;
|
||||
|
||||
@ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
|
||||
private String star;
|
||||
|
||||
@ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
|
||||
private String fork;
|
||||
|
||||
@ExtractByUrl
|
||||
private String url;
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0)
|
||||
, new PageModelPipeline<GithubRepo>() {
|
||||
@Override
|
||||
public void process(GithubRepo o, Task task) {
|
||||
Assert.assertEquals("78",o.getStar().trim());
|
||||
Assert.assertEquals("65",o.getFork().trim());
|
||||
}
|
||||
}, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String key() {
|
||||
return author + ":" + name;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getReadme() {
|
||||
return readme;
|
||||
}
|
||||
|
||||
public String getAuthor() {
|
||||
return author;
|
||||
}
|
||||
|
||||
public List<String> getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public String getStar() {
|
||||
return star;
|
||||
}
|
||||
|
||||
public String getFork() {
|
||||
return fork;
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import junit.framework.Assert;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.*;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.pipeline.Pipeline;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class GithubRepoProcessor implements PageProcessor {
|
||||
@Override
|
||||
public void process(Page page) {
|
||||
page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString());
|
||||
page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Site getSite() {
|
||||
return Site.me().addStartUrl("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() {
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
Assert.assertEquals("78",((String)resultItems.get("star")).trim());
|
||||
Assert.assertEquals("65",((String)resultItems.get("fork")).trim());
|
||||
}
|
||||
}).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic");
|
||||
}
|
||||
|
||||
}
|
@ -1,8 +0,0 @@
|
||||
#!/bin/sh
# Assembles a WordPress export file: appends wp-head.xml, then every file in
# the current directory, then wp-bottom.xml.

touch wordpress.xml
cat wp-head.xml >> wordpress.xml

# Use a glob instead of parsing `ls` output, and quote the expansion, so file
# names containing whitespace or glob characters are handled as single names.
for f in *
do
    # Only regular files can be concatenated; skip directories and the
    # literal "*" left behind when the glob matches nothing.
    [ -f "${f}" ] || continue
    # NOTE(review): the items go to ../wordpress.xml while head and bottom go to
    # ./wordpress.xml, and the loop also picks up wp-head.xml, wp-bottom.xml and
    # wordpress.xml itself - confirm the intended target and file set.
    cat "${f}" >> ../wordpress.xml
done

cat wp-bottom.xml >> wordpress.xml
|
@ -1,22 +0,0 @@
|
||||
<item>
|
||||
<title>${title}</title>
|
||||
<link>http://127.0.0.1/wordpress/?p=${id}</link>
|
||||
<pubDate>${date}</pubDate>
|
||||
<dc:creator>admin</dc:creator>
|
||||
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
|
||||
<description></description>
|
||||
<content:encoded><![CDATA[${content}]]></content:encoded>
|
||||
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
|
||||
<wp:post_id>${id}</wp:post_id>
|
||||
<wp:post_date>${date}</wp:post_date>
|
||||
<wp:post_date_gmt>${date}</wp:post_date_gmt>
|
||||
<wp:comment_status>open</wp:comment_status>
|
||||
<wp:ping_status>open</wp:ping_status>
|
||||
<wp:post_name>${title}</wp:post_name>
|
||||
<wp:status>publish</wp:status>
|
||||
<wp:post_parent>0</wp:post_parent>
|
||||
<wp:menu_order>0</wp:menu_order>
|
||||
<wp:post_type>post</wp:post_type>
|
||||
<wp:post_password></wp:post_password>
|
||||
<wp:is_sticky>0</wp:is_sticky>
|
||||
</item>
|
@ -1,2 +0,0 @@
|
||||
</channel>
|
||||
</rss>
|
@ -1,35 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
|
||||
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
|
||||
<!-- You may use this file to transfer that content from one site to another. -->
|
||||
<!-- This file is not intended to serve as a complete backup of your site. -->
|
||||
|
||||
<!-- To import this information into a WordPress site follow these steps: -->
|
||||
<!-- 1. Log in to that site as an administrator. -->
|
||||
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
|
||||
<!-- 3. Install the "WordPress" importer from the list. -->
|
||||
<!-- 4. Activate & Run Importer. -->
|
||||
<!-- 5. Upload this file using the form provided on that page. -->
|
||||
<!-- 6. You will first be asked to map the authors in this export file to users -->
|
||||
<!-- on the site. For each author, you may choose to map to an -->
|
||||
<!-- existing user on the site or to create a new user. -->
|
||||
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
|
||||
<!-- contained in this file into your site. -->
|
||||
|
||||
<!-- generator="WordPress/3.3.1" created="2012-06-10 09:15" -->
|
||||
<rss version="2.0"
|
||||
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:wp="http://wordpress.org/export/1.1/"
|
||||
>
|
||||
<channel>
|
||||
<wp:wxr_version>1.1</wp:wxr_version>
|
||||
<wp:base_site_url>http://127.0.0.1/wordpress</wp:base_site_url>
|
||||
<wp:base_blog_url>http://127.0.0.1/wordpress</wp:base_blog_url>
|
||||
|
||||
<wp:author><wp:author_id>1</wp:author_id><wp:author_login>admin</wp:author_login><wp:author_email>flashsword20@163.com</wp:author_email><wp:author_display_name><![CDATA[admin]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
|
||||
|
||||
|
||||
<generator>http://wordpress.org/?v=3.3.1</generator>
|
@ -1,28 +0,0 @@
|
||||
package us.codecraft.webmagic.processor;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import us.codecraft.webmagic.Spider;
|
||||
import us.codecraft.webmagic.pipeline.FilePipeline;
|
||||
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
|
||||
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
|
||||
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-6-9
|
||||
* Time: 上午8:02
|
||||
*/
|
||||
public class DiaoyuwengProcessorTest {
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void test() throws IOException {
|
||||
DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
|
||||
JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
|
||||
Spider.create(diaoyuwengProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
|
||||
run();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue