update to 0.3.1

pull/358/head
yihua.huang 12 years ago
commit a2fba8caa2

1
.gitignore vendored

@ -1,3 +1,4 @@
target/*
*.iml
out/
.idea

@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.3.0</version>
<version>0.3.1</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
@ -109,6 +109,14 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<forkMode>pertest</forkMode>
<argLine>-Xms1024m -Xmx1024m -Xss1m </argLine>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>

@ -1,11 +0,0 @@
#release configuration
#Tue Aug 20 23:36:56 CST 2013
scm.tagNameFormat=@{project.artifactId}-@{project.version}
pushChanges=true
scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git
preparationGoals=clean verify
remoteTagging=true
scm.commentPrefix=[maven-release-plugin]
exec.additionalArguments=-Psonatype-oss-release -P development
exec.snapshotReleasePluginAllowed=false
completedPhase=check-poms

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.3.0</version>
<version>0.3.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -79,22 +79,22 @@ public class Spider implements Runnable, Task {
* create a spider with pageProcessor.
*
* @param pageProcessor
* @return new spider
* @see PageProcessor
*/
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls();
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
}
/**
* create a spider with pageProcessor.
*
* @param pageProcessor
* @return new spider
* @see PageProcessor
*/
public static Spider create(PageProcessor pageProcessor) {
return new Spider(pageProcessor);
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.startUrls = pageProcessor.getSite().getStartUrls();
}
/**
@ -105,7 +105,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public Spider startUrls(List<String> startUrls) {
checkIfNotRunning();
checkIfRunning();
this.startUrls = startUrls;
return this;
}
@ -139,11 +139,11 @@ public class Spider implements Runnable, Task {
*
* @param scheduler
* @return this
* @since 0.2.1
* @see Scheduler
* @since 0.2.1
*/
public Spider setScheduler(Scheduler scheduler) {
checkIfNotRunning();
checkIfRunning();
this.scheduler = scheduler;
return this;
}
@ -153,8 +153,8 @@ public class Spider implements Runnable, Task {
*
* @param pipeline
* @return this
* @deprecated
* @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline)
* @deprecated
*/
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
@ -165,11 +165,11 @@ public class Spider implements Runnable, Task {
*
* @param pipeline
* @return this
* @since 0.2.1
* @see Pipeline
* @since 0.2.1
*/
public Spider addPipeline(Pipeline pipeline) {
checkIfNotRunning();
checkIfRunning();
this.pipelines.add(pipeline);
return this;
}
@ -189,8 +189,8 @@ public class Spider implements Runnable, Task {
*
* @param downloader
* @return this
* @deprecated
* @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
* @deprecated
*/
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
@ -198,12 +198,13 @@ public class Spider implements Runnable, Task {
/**
* set the downloader of spider
* @see Downloader
*
* @param downloader
* @return this
* @see Downloader
*/
public Spider setDownloader(Downloader downloader) {
checkIfNotRunning();
checkIfRunning();
this.downloader = downloader;
return this;
}
@ -220,7 +221,8 @@ public class Spider implements Runnable, Task {
@Override
public void run() {
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)) {
if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING)
&& !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) {
throw new IllegalStateException("Spider is already running!");
}
checkComponent();
@ -228,18 +230,19 @@ public class Spider implements Runnable, Task {
for (String startUrl : startUrls) {
scheduler.push(new Request(startUrl), this);
}
startUrls.clear();
}
Request request = scheduler.poll(this);
//singel thread
//single thread
if (executorService == null) {
while (request != null) {
while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
processRequest(request);
request = scheduler.poll(this);
}
} else {
//multi thread
final AtomicInteger threadAlive = new AtomicInteger(0);
while (true) {
while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) {
if (request == null) {
//when no request found but some thread is alive, sleep a while.
try {
@ -311,7 +314,7 @@ public class Spider implements Runnable, Task {
return;
}
//for cycle retry
if (page.getHtml()==null){
if (page.getHtml() == null) {
addRequest(page);
sleep(site.getSleepTime());
return;
@ -342,8 +345,8 @@ public class Spider implements Runnable, Task {
}
}
protected void checkIfNotRunning() {
if (!stat.compareAndSet(STAT_INIT, STAT_INIT)) {
protected void checkIfRunning() {
if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) {
throw new IllegalStateException("Spider is already running!");
}
}
@ -354,6 +357,19 @@ public class Spider implements Runnable, Task {
thread.start();
}
/**
 * Starts the spider by delegating to {@code runAsync()}.
 * <p>
 * NOTE(review): {@code runAsync()} appears to launch the crawl on a separate thread so that
 * this call returns immediately - confirm against its full definition.
 */
public void start() {
runAsync();
}
/**
 * Requests the spider to stop.
 * <p>
 * Switches the state from {@code STAT_RUNNING} to {@code STAT_STOPPED} with a single
 * compare-and-set; when the spider is not in {@code STAT_RUNNING} the state is left unchanged.
 * The crawl loops in {@code run()} test for {@code STAT_RUNNING} before taking the next
 * request, so the stop takes effect between requests rather than interrupting one.
 */
public void stop() {
// only a running spider can move to the stopped state
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
}
/**
 * Requests the spider to stop and then calls {@code destroy()}.
 * <p>
 * The state change is the same compare-and-set from {@code STAT_RUNNING} to
 * {@code STAT_STOPPED} that {@code stop()} performs; {@code destroy()} is invoked
 * regardless of whether that transition succeeded.
 * <p>
 * NOTE(review): {@code destroy()} presumably releases the spider's resources - confirm
 * whether it is safe to call while worker threads are still finishing a request.
 */
public void stopAndDestroy() {
// only a running spider can move to the stopped state
stat.compareAndSet(STAT_RUNNING, STAT_STOPPED);
destroy();
}
/**
* start with more than one threads
*
@ -361,7 +377,7 @@ public class Spider implements Runnable, Task {
* @return this
*/
public Spider thread(int threadNum) {
checkIfNotRunning();
checkIfRunning();
this.threadNum = threadNum;
if (threadNum <= 0) {
throw new IllegalArgumentException("threadNum should be more than one!");
@ -377,9 +393,10 @@ public class Spider implements Runnable, Task {
/**
* switch off xsoup
*
* @return
*/
public static void xsoupOff(){
public static void xsoupOff() {
EnvironmentUtil.setUseXsoup(false);
}

@ -2,22 +2,30 @@ package us.codecraft.webmagic.selector;
import org.jsoup.Jsoup;
import java.util.ArrayList;
import java.util.List;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public abstract class BaseElementSelector implements Selector,ElementSelector {
public abstract class BaseElementSelector implements Selector, ElementSelector {
@Override
public String select(String text) {
return select(Jsoup.parse(text));
if (text != null) {
return select(Jsoup.parse(text));
}
return null;
}
@Override
public List<String> selectList(String text) {
return selectList(Jsoup.parse(text));
if (text != null) {
return selectList(Jsoup.parse(text));
} else {
return new ArrayList<String>();
}
}
}

@ -1,91 +0,0 @@
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;
import java.lang.reflect.Constructor;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Selector factory with an inner cache.<br>
 * <p>
 * Selectors are created reflectively from their {@code String} constructor arguments.
 * Instances obtained through {@link #newAndCacheSelector(Class, String...)} and
 * {@link #newRegexSelector(String, int)} are stored in a cache keyed by selector class and
 * arguments and are handed out again on later calls with the same class and arguments.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class SelectorFactory {

    /** Cache of created selectors, keyed by {@link #getCacheKey(Class, String...)}. */
    private final Map<String, Selector> innerCache = new ConcurrentHashMap<String, Selector>();

    private static final SelectorFactory INSTATNCE = new SelectorFactory();

    /**
     * @return the shared factory instance (the misspelled name is kept for compatibility)
     */
    public static SelectorFactory getInstatnce() {
        return INSTATNCE;
    }

    /**
     * Creates a new, uncached regex selector.
     *
     * @param regex regular expression
     * @return new selector
     */
    public RegexSelector newRegexSelector(String regex) {
        return newSelector(RegexSelector.class, regex);
    }

    /**
     * Returns a regex selector for the given expression and capture group, reusing a cached
     * instance when one exists.
     *
     * @param regex regular expression
     * @param group capture group passed to the selector
     * @return cached or newly created selector
     */
    public RegexSelector newRegexSelector(String regex, int group) {
        String cacheKey = getCacheKey(RegexSelector.class, regex, String.valueOf(group));
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return (RegexSelector) cached;
        }
        RegexSelector selector = new RegexSelector(regex, group);
        // store the instance so that the lookup above can actually hit on the next call
        innerCache.put(cacheKey, selector);
        return selector;
    }

    /**
     * Creates a new, uncached replace selector.
     *
     * @param regex       regular expression
     * @param replacement replacement text
     * @return new selector
     */
    public ReplaceSelector newReplaceSelector(String regex, String replacement) {
        return newSelector(ReplaceSelector.class, regex, replacement);
    }

    /**
     * Creates a new, uncached xpath selector.
     *
     * @param xpath xpath expression
     * @return new selector
     */
    public XpathSelector newXpathSelector(String xpath) {
        return newSelector(XpathSelector.class, xpath);
    }

    /**
     * Creates a new, uncached smart content selector.
     *
     * @return new selector
     */
    public SmartContentSelector newSmartContentSelector() {
        return newSelector(SmartContentSelector.class);
    }

    /**
     * Returns the cached selector for the given class and arguments, creating and caching it
     * on first use.
     *
     * @param clazz selector class
     * @param param constructor arguments
     * @return cached or newly created selector
     * @throws IllegalArgumentException if the selector cannot be instantiated
     */
    public <T extends Selector> T newAndCacheSelector(Class<T> clazz, String... param) {
        // the key must be built from the requested class; a fixed class would make
        // different selector types with equal arguments share one cache slot
        String cacheKey = getCacheKey(clazz, param);
        Selector cached = innerCache.get(cacheKey);
        if (cached != null) {
            return clazz.cast(cached);
        }
        T selector = newSelector(clazz, param);
        if (selector != null) {
            innerCache.put(cacheKey, selector);
        }
        return selector;
    }

    /**
     * Instantiates a selector through its public constructor taking zero, one or two
     * {@code String} arguments.
     *
     * @param clazz selector class
     * @param param constructor arguments (at most two)
     * @return new selector
     * @throws IllegalArgumentException if no matching constructor exists, the constructor
     *                                  fails, or more than two arguments are given
     */
    public <T extends Selector> T newSelector(Class<T> clazz, String... param) {
        try {
            if (param.length == 0) {
                Constructor<T> constructor = clazz.getConstructor();
                return constructor.newInstance();
            } else if (param.length == 1) {
                Constructor<T> constructor = clazz.getConstructor(String.class);
                return constructor.newInstance(param[0]);
            } else if (param.length == 2) {
                Constructor<T> constructor = clazz.getConstructor(String.class, String.class);
                return constructor.newInstance(param[0], param[1]);
            } else {
                // deliberately thrown inside the try: callers see it wrapped like any other failure
                throw new UnsupportedOperationException();
            }
        } catch (Exception e) {
            throw new IllegalArgumentException("init object error", e);
        }
    }

    /**
     * Builds the cache key from the selector class and its constructor arguments.
     */
    private String getCacheKey(Class<?> clazz, String... param) {
        return clazz.toString() + "_" + StringUtils.join(param, "_");
    }
}

@ -2,6 +2,8 @@ package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -18,47 +20,33 @@ public class UrlUtils {
/**
* canonicalizeUrl
*
* Borrowed from Jsoup.
*
* @param url
* @param refer
* @return canonicalizeUrl
*/
public static String canonicalizeUrl(String url, String refer) {
if (StringUtils.isBlank(url) || StringUtils.isBlank(refer)) {
return url;
}
if (url.startsWith("http") || url.startsWith("ftp") || url.startsWith("mailto") || url.startsWith("javascript:")) {
return url;
}
if (StringUtils.startsWith(url, "/")) {
String host = getHost(refer);
return host + url;
} else if (!StringUtils.startsWith(url, ".")) {
refer = reversePath(refer, 1);
return refer + "/" + url;
} else {
Matcher matcher = relativePathPattern.matcher(url);
if (matcher.find()) {
int reverseDepth = matcher.group(1).length();
refer = reversePath(refer, reverseDepth);
String substring = StringUtils.substring(url, matcher.end());
return refer + "/" + substring;
} else {
refer = reversePath(refer, 1);
return refer + "/" + url;
URL base;
try {
try {
base = new URL(refer);
} catch (MalformedURLException e) {
// the base is unsuitable, but the attribute may be abs on its own, so try that
URL abs = new URL(refer);
return abs.toExternalForm();
}
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
if (url.startsWith("?"))
url = base.getPath() + url;
URL abs = new URL(base, url);
return abs.toExternalForm();
} catch (MalformedURLException e) {
return "";
}
}
/**
 * Cuts {@code url} at the {@code depth}-th slash counted from the end.
 * <p>
 * When that slash sits before index 10 (or is not found) the result of
 * {@code getHost(url)} is returned instead.
 * NOTE(review): the threshold 10 presumably keeps the cut from landing inside the
 * scheme/host part of the address - confirm.
 *
 * @param url   address to shorten
 * @param depth which slash from the end to cut at (1 = last)
 * @return the shortened address
 */
public static String reversePath(String url, int depth) {
    int cut = StringUtils.lastOrdinalIndexOf(url, "/", depth);
    return cut < 10 ? getHost(url) : StringUtils.substring(url, 0, cut);
}
public static String getHost(String url) {
String host = url;
int i = StringUtils.ordinalIndexOf(url, "/", 3);

@ -0,0 +1,28 @@
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
/**
 * Manual check that a spider can be started in the background and stopped again.
 *
 * @author code4crafter@gmail.com
 */
public class SpiderTest {

    /**
     * Starts a crawl, lets it work for ten seconds, stops it and waits another ten seconds.
     * Ignored by default because of its running time.
     */
    @Ignore("long time")
    @Test
    public void testStartAndStop() throws InterruptedException {
        SimplePageProcessor processor =
                new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*");
        // prints a marker for every processed result so that progress is visible on the console
        Pipeline marker = new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                System.out.println(1);
            }
        };
        Spider spider = Spider.create(processor).addPipeline(marker);

        spider.start();
        Thread.sleep(10000);

        spider.stop();
        // a restart via spider.run() was tried here once and is left disabled
        Thread.sleep(10000);
    }
}

@ -19,13 +19,12 @@ public class UrlUtilsTest {
fixrelativeurl = UrlUtils.canonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("..../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl(".../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/sh/ss/..aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
Assert.assertEquals("http://www.dianping.com/sh/aa", fixrelativeurl);
fixrelativeurl = UrlUtils.canonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
Assert.assertEquals("http://www.dianping.com/aa", fixrelativeurl);
}
@Test

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.3.0</version>
<version>0.3.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -184,7 +184,7 @@ class PageModelExtractor {
return null;
}
if (objectExtractor == null) {
return processSingle(page, null, false);
return processSingle(page, null, true);
} else {
if (objectExtractor.multi) {
List<Object> os = new ArrayList<Object>();

@ -0,0 +1,14 @@
package us.codecraft.webmagic;
import junit.framework.Assert;
import us.codecraft.webmagic.model.PageModelPipeline;
/**
 * Test pipeline whose only job is to assert that every object handed to it is non-null.
 * <p>
 * NOTE(review): {@code PageModelPipeline} is implemented as a raw type here although it is
 * used with a type argument elsewhere in the tests; parameterizing it would need a check of
 * every place this mock is passed in.
 *
 * @author code4crafter@gmail.com
 */
public class MockPageModelPipeline implements PageModelPipeline{
/**
 * Fails the running test when {@code o} is {@code null}; {@code task} is not used.
 */
@Override
public void process(Object o, Task task) {
Assert.assertNotNull(o);
}
}

@ -0,0 +1,13 @@
package us.codecraft.webmagic;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline for tests that discards every result.
 *
 * @author code4crafter@gmail.com
 */
public class MockPipeline implements Pipeline{
/**
 * Intentionally does nothing with {@code resultItems} or {@code task}.
 */
@Override
public void process(ResultItems resultItems, Task task) {
}
}

@ -0,0 +1,87 @@
package us.codecraft.webmagic.model;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.MockDownloader;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import java.util.List;
/**
 * Page model of a GitHub repository page that also carries its own JUnit test.
 * <p>
 * The annotated fields declare what is extracted from a page; {@link #test()} runs the
 * extraction with a {@code MockDownloader} in place of the default downloader and checks
 * the extracted star and fork values.
 *
 * @author code4crafter@gmail.com <br>
 */
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
public class GithubRepo implements HasKey {

    /** Repository name; declared {@code notNull}. */
    @ExtractBy(value = "//h1[@class='entry-title public']/strong/a/text()", notNull = true)
    private String name;

    /** Owner of the repository, taken from the first group of the page address pattern. */
    @ExtractByUrl("https://github\\.com/(\\w+)/.*")
    private String author;

    /** The readme block of the page. */
    @ExtractBy("//div[@id='readme']")
    private String readme;

    /** Language entries of the repository statistics; a multi-valued extraction. */
    @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true)
    private List<String> language;

    /** Text of the counter in the second page-head action. */
    @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()")
    private String star;

    /** Text of the counter in the third page-head action. */
    @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()")
    private String fork;

    /** Extracted from the page address with the annotation's default pattern. */
    @ExtractByUrl
    private String url;

    public String getName() {
        return name;
    }

    public String getReadme() {
        return readme;
    }

    public String getAuthor() {
        return author;
    }

    public List<String> getLanguage() {
        return language;
    }

    public String getUrl() {
        return url;
    }

    public String getStar() {
        return star;
    }

    public String getFork() {
        return fork;
    }

    /**
     * @return {@code author:name}
     */
    @Override
    public String key() {
        return author + ":" + name;
    }

    /**
     * Extracts this model from the webmagic repository page and verifies star and fork counts.
     */
    @Test
    public void test() {
        final String repoUrl = "https://github.com/code4craft/webmagic";
        Site site = Site.me().addStartUrl(repoUrl).setSleepTime(0);
        PageModelPipeline<GithubRepo> starAndForkCheck = new PageModelPipeline<GithubRepo>() {
            @Override
            public void process(GithubRepo o, Task task) {
                Assert.assertEquals("78", o.getStar().trim());
                Assert.assertEquals("65", o.getFork().trim());
            }
        };
        OOSpider.create(site, starAndForkCheck, GithubRepo.class)
                .setDownloader(new MockDownloader())
                .test(repoUrl);
    }
}

@ -0,0 +1,35 @@
package us.codecraft.webmagic.processor;
import junit.framework.Assert;
import org.junit.Test;
import us.codecraft.webmagic.*;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Page processor that extracts the star and fork counters of a GitHub repository page,
 * together with a JUnit test that runs it with a {@code MockDownloader} in place of the
 * default downloader.
 *
 * @author code4crafter@gmail.com
 */
public class GithubRepoProcessor implements PageProcessor {

    private static final String REPO_URL = "https://github.com/code4craft/webmagic";

    private static final String STAR_XPATH =
            "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()";

    private static final String FORK_XPATH =
            "//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()";

    /**
     * Stores the two counters under the field names {@code star} and {@code fork}.
     */
    @Override
    public void process(Page page) {
        String star = page.getHtml().xpath(STAR_XPATH).toString();
        page.putField("star", star);
        String fork = page.getHtml().xpath(FORK_XPATH).toString();
        page.putField("fork", fork);
    }

    /**
     * @return a site whose only start address is the webmagic repository page
     */
    @Override
    public Site getSite() {
        return Site.me().addStartUrl(REPO_URL);
    }

    /**
     * Processes the repository page and verifies the extracted star and fork values.
     */
    @Test
    public void test() {
        Pipeline starAndForkCheck = new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                Assert.assertEquals("78", ((String) resultItems.get("star")).trim());
                Assert.assertEquals("65", ((String) resultItems.get("fork")).trim());
            }
        };
        OOSpider.create(new GithubRepoProcessor())
                .addPipeline(starAndForkCheck)
                .setDownloader(new MockDownloader())
                .test(REPO_URL);
    }
}

@ -5,7 +5,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.3.0</version>
<version>0.3.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -14,8 +14,6 @@ import java.util.Scanner;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-7 <br>
* Time: 9:24 <br>
*/
public class QuickStarter {

@ -14,8 +14,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-10 <br>
* Time: 6:37 <br>
*/
@TargetUrl("https://github.com/\\w+/\\w+")
@HelpUrl({"https://github.com/\\w+\\?tab=repositories","https://github.com/\\w+","https://github.com/explore/*"})

@ -28,7 +28,7 @@ public class IteyeBlog implements Blog{
}
public static void main(String[] args) {
OOSpider.create(Site.me().addStartUrl("http://*.iteye.com/blog"), IteyeBlog.class).run();
OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
}
public String getTitle() {

@ -10,8 +10,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-11 <br>
* Time: 9:29 <br>
*/
@TargetUrl("http://www.36kr.com/p/\\d+.html")
@HelpUrl("http://www.36kr.com/#/page/\\d+")

@ -16,8 +16,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-4 <br>
* Time: 8:17 <br>
*/
@TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html")
public class News163 implements MultiPageModel {

@ -9,8 +9,6 @@ import us.codecraft.webmagic.model.annotation.TargetUrl;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-3 <br>
* Time: 8:25 <br>
*/
@TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
@HelpUrl("http://www.oschina.net/question/*")

@ -11,8 +11,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-8-2 <br>
* Time: 7:52 <br>
*/
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog implements HasKey{

@ -8,8 +8,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 8:08
*/
public class DiandianBlogProcessor implements PageProcessor {

@ -9,8 +9,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 8:08
*/
public class HuxiuProcessor implements PageProcessor {
@Override
@ -18,13 +16,16 @@ public class HuxiuProcessor implements PageProcessor {
List<String> requests = page.getHtml().links().regex(".*article.*").all();
page.addTargetRequests(requests);
page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()"));
page.putField("content",page.getHtml().smartContent());
page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()"));
}
@Override
public Site getSite() {
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/");
}
public static void main(String[] args) {
Spider.create(new HuxiuProcessor()).run();
}
public static void main(String[] args) {

@ -10,8 +10,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 8:08
*/
public class InfoQMiniBookProcessor implements PageProcessor {

@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-7-26 <br>
* Time: 7:31 <br>
*/
public class IteyeBlogProcessor implements PageProcessor {
@ -24,8 +22,7 @@ public class IteyeBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site == null) {
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/").
setSleepTime(100).setRetryTimes(3);
site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
}
return site;
}

@ -22,7 +22,6 @@ public class NjuBBSProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures");
}
}

@ -9,8 +9,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:48
*/
public class OschinaBlogPageProcesser implements PageProcessor {

@ -8,8 +8,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:48
*/
public class OschinaPageProcesser implements PageProcessor {

@ -8,8 +8,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 8:08
*/
public class QzoneBlogProcessor implements PageProcessor {
@Override

@ -7,8 +7,6 @@ import us.codecraft.webmagic.processor.PageProcessor;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:48
*/
public class SinaBlogProcesser implements PageProcessor {

@ -8,8 +8,6 @@ import java.util.List;
/**
* @author code4crafter@gmail.com <br>
* Date: 13-4-21
* Time: 1:48
*/
public class TianyaPageProcesser implements PageProcessor {

@ -1,8 +0,0 @@
#!/bin/sh
# Assemble wordpress.xml in the current directory:
#   wp-head.xml, then every other regular file in this directory, then wp-bottom.xml.
#
# NOTE(review): the previous version appended the item files to ../wordpress.xml while the
# header and footer went to ./wordpress.xml; a single output file is assumed to be the intent.

out=wordpress.xml
self=${0##*/}

# The whole group is redirected once, so a re-run rebuilds the file instead of appending to it.
{
    cat wp-head.xml
    # glob instead of parsing `ls`, and quote names so that spaces do not split them
    for f in *; do
        case "$f" in
            # never feed the output, the header/footer or this script into the item list
            "$out"|wp-head.xml|wp-bottom.xml|"$self") continue ;;
        esac
        [ -f "$f" ] || continue
        cat "$f"
    done
    cat wp-bottom.xml
} > "$out"

@ -1,22 +0,0 @@
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<description></description>
<content:encoded><![CDATA[${content}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>${id}</wp:post_id>
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
<wp:ping_status>open</wp:ping_status>
<wp:post_name>${title}</wp:post_name>
<wp:status>publish</wp:status>
<wp:post_parent>0</wp:post_parent>
<wp:menu_order>0</wp:menu_order>
<wp:post_type>post</wp:post_type>
<wp:post_password></wp:post_password>
<wp:is_sticky>0</wp:is_sticky>
</item>

@ -1,35 +0,0 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- This is a WordPress eXtended RSS file generated by WordPress as an export of your site. -->
<!-- It contains information about your site's posts, pages, comments, categories, and other content. -->
<!-- You may use this file to transfer that content from one site to another. -->
<!-- This file is not intended to serve as a complete backup of your site. -->
<!-- To import this information into a WordPress site follow these steps: -->
<!-- 1. Log in to that site as an administrator. -->
<!-- 2. Go to Tools: Import in the WordPress admin panel. -->
<!-- 3. Install the "WordPress" importer from the list. -->
<!-- 4. Activate & Run Importer. -->
<!-- 5. Upload this file using the form provided on that page. -->
<!-- 6. You will first be asked to map the authors in this export file to users -->
<!-- on the site. For each author, you may choose to map to an -->
<!-- existing user on the site or to create a new user. -->
<!-- 7. WordPress will then import each of the posts, pages, comments, categories, etc. -->
<!-- contained in this file into your site. -->
<!-- generator="WordPress/3.3.1" created="2012-06-10 09:15" -->
<rss version="2.0"
xmlns:excerpt="http://wordpress.org/export/1.1/excerpt/"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:wp="http://wordpress.org/export/1.1/"
>
<channel>
<wp:wxr_version>1.1</wp:wxr_version>
<wp:base_site_url>http://127.0.0.1/wordpress</wp:base_site_url>
<wp:base_blog_url>http://127.0.0.1/wordpress</wp:base_blog_url>
<wp:author><wp:author_id>1</wp:author_id><wp:author_login>admin</wp:author_login><wp:author_email>flashsword20@163.com</wp:author_email><wp:author_display_name><![CDATA[admin]]></wp:author_display_name><wp:author_first_name><![CDATA[]]></wp:author_first_name><wp:author_last_name><![CDATA[]]></wp:author_last_name></wp:author>
<generator>http://wordpress.org/?v=3.3.1</generator>

@ -1,28 +0,0 @@
package us.codecraft.webmagic.processor;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.samples.DiaoyuwengProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import java.io.IOException;
/**
 * Manual run of {@link DiaoyuwengProcessor} that writes results through a
 * {@link FilePipeline} and a {@link JsonFilePipeline} and uses a
 * {@link FileCacheQueueScheduler}. Ignored by default.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-6-9
 * Time: 8:02
 */
public class DiaoyuwengProcessorTest {

    /**
     * Crawls with the sample processor; output goes to {@code /data/webmagic/} and the
     * scheduler is given {@code /data/temp/webmagic/cache/}.
     */
    @Ignore
    @Test
    public void test() throws IOException {
        DiaoyuwengProcessor diaoyuwengProcessor = new DiaoyuwengProcessor();
        JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
        // addPipeline replaces the deprecated pipeline(...), which only delegates to it
        Spider.create(diaoyuwengProcessor)
                .addPipeline(new FilePipeline())
                .addPipeline(pipeline)
                .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/"))
                .run();
    }
}
Loading…
Cancel
Save