split modules

pull/17/head
yihua.huang 12 years ago
parent 3c3f001186
commit 6dc88fa111

@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the webmagic-core module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <!-- Compile sources as UTF-8 instead of the platform default charset,
             so the build gives the same result on every machine. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTTP transport. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.2.1</version>
        </dependency>
        <!-- Unit tests only; not packaged with the artifact. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.7</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>13.0.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.1</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>commons-collections</groupId>
            <artifactId>commons-collections</artifactId>
            <version>3.2.1</version>
        </dependency>
        <!-- HTML parsing. -->
        <dependency>
            <groupId>net.sourceforge.htmlcleaner</groupId>
            <artifactId>htmlcleaner</artifactId>
            <version>2.4</version>
        </dependency>
        <!-- Commons IO is published under the commons-io groupId; using the
             canonical coordinates lets Maven reconcile versions with any
             other library that depends on it. Same version as before. -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>1.3.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Copy and filter resources as UTF-8. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <configuration>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Attach a sources jar to the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Attach a javadoc jar to the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <executions>
                    <execution>
                        <id>attach-javadocs</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-release-plugin</artifactId>
                <version>2.0-beta-7</version>
            </plugin>
        </plugins>
    </build>
</project>

@ -1,8 +1,8 @@
package us.codecraft.spider;
package us.codecraft.webmagic;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.spider.selector.Selectable;
import us.codecraft.spider.utils.UrlUtils;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.ArrayList;
import java.util.List;

@ -1,8 +1,4 @@
package us.codecraft.spider;
import us.codecraft.spider.Site;
import java.util.List;
package us.codecraft.webmagic;
/**
* User: cairne

@ -1,4 +1,4 @@
package us.codecraft.spider;
package us.codecraft.webmagic;
import java.util.HashSet;
import java.util.Set;

@ -1,17 +1,14 @@
package us.codecraft.spider;
package us.codecraft.webmagic;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.Logger;
import us.codecraft.spider.downloader.Downloader;
import us.codecraft.spider.downloader.HttpClientDownloader;
import us.codecraft.spider.pipeline.ConsolePipeline;
import us.codecraft.spider.pipeline.Pipeline;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.spider.schedular.QueueSchedular;
import us.codecraft.spider.schedular.Schedular;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.schedular.QueueSchedular;
import us.codecraft.webmagic.schedular.Schedular;
/**
* User: cairne

@ -1,8 +1,8 @@
package us.codecraft.spider.downloader;
package us.codecraft.webmagic.downloader;
import us.codecraft.spider.Page;
import us.codecraft.spider.Request;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
* User: cairne

@ -1,16 +1,16 @@
package us.codecraft.spider.downloader;
package us.codecraft.webmagic.downloader;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.log4j.Logger;
import us.codecraft.spider.Page;
import us.codecraft.spider.Request;
import us.codecraft.spider.Site;
import us.codecraft.spider.selector.Html;
import us.codecraft.spider.selector.PlainText;
import us.codecraft.spider.utils.UrlUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
/**

@ -1,4 +1,4 @@
package us.codecraft.spider.downloader;
package us.codecraft.webmagic.downloader;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
@ -10,7 +10,7 @@ import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.*;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Site;
/**
* User: cairne

@ -1,8 +1,8 @@
package us.codecraft.spider.pipeline;
package us.codecraft.webmagic.pipeline;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.selector.Selectable;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Map;

@ -1,10 +1,10 @@
package us.codecraft.spider.pipeline;
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.selector.Selectable;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.selector.Selectable;
import java.io.File;
import java.io.FileWriter;
@ -19,7 +19,7 @@ import java.util.Map;
*/
public class FilePipeline implements Pipeline {
private String path = "/data/temp/spider/";
private String path = "/data/temp/webmagic/";
public FilePipeline(){

@ -1,7 +1,7 @@
package us.codecraft.spider.pipeline;
package us.codecraft.webmagic.pipeline;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
* User: cairne

@ -1,7 +1,7 @@
package us.codecraft.spider.processor;
package us.codecraft.webmagic.processor;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
/**
* User: cairne

@ -1,8 +1,8 @@
package us.codecraft.spider.processor;
package us.codecraft.webmagic.processor;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.utils.UrlUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.List;

@ -1,9 +1,9 @@
package us.codecraft.spider.schedular;
package us.codecraft.webmagic.schedular;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
import us.codecraft.spider.Site;
import us.codecraft.spider.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Request;
import java.io.*;
import java.util.LinkedHashSet;

@ -1,8 +1,8 @@
package us.codecraft.spider.schedular;
package us.codecraft.webmagic.schedular;
import org.apache.log4j.Logger;
import us.codecraft.spider.Request;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import java.util.HashSet;
import java.util.Set;

@ -1,7 +1,7 @@
package us.codecraft.spider.schedular;
package us.codecraft.webmagic.schedular;
import us.codecraft.spider.Request;
import us.codecraft.spider.Site;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
/**
* User: cairne

@ -1,6 +1,4 @@
package us.codecraft.spider.selector;
import org.apache.commons.collections.CollectionUtils;
package us.codecraft.webmagic.selector;
import java.util.ArrayList;
import java.util.List;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.apache.commons.collections.CollectionUtils;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
/**
* User: cairne

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import java.util.List;
import java.util.regex.Matcher;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import java.util.List;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import java.util.List;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.apache.commons.lang3.StringUtils;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.htmlcleaner.*;

@ -1,4 +1,4 @@
package us.codecraft.spider.utils;
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.StringUtils;

@ -1,8 +1,8 @@
package us.codecraft.spider;
package us.codecraft.webmagic;
import org.junit.Assert;
import org.junit.Test;
import us.codecraft.spider.selector.Html;
import us.codecraft.webmagic.selector.Html;
/**
* User: cairne

@ -1,11 +1,11 @@
package us.codecraft.spider;
package us.codecraft.webmagic;
import org.junit.Ignore;
import org.junit.Test;
import us.codecraft.spider.pipeline.FilePipeline;
import us.codecraft.spider.processor.SimplePageProcessor;
import us.codecraft.spider.samples.HuxiuProcessor;
import us.codecraft.spider.schedular.FileCacheQueueSchedular;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.SimplePageProcessor;
import us.codecraft.webmagic.samples.HuxiuProcessor;
import us.codecraft.webmagic.schedular.FileCacheQueueSchedular;
/**
* User: cairne
@ -24,12 +24,12 @@ public class SpiderTest {
@Test
public void testGlobalSpider(){
// PageProcessor pageProcessor = new MeicanProcessor();
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/spider/cache/")).
// Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
// processor(pageProcessor).run();
SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
pageProcessor2.getSite().setEncoding("GBK");
System.out.println(pageProcessor2.getSite().getEncoding());
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/spider/cache/")).
Spider.me().pipeline(new FilePipeline()).schedular(new FileCacheQueueSchedular(pageProcessor2.getSite(),"/data/temp/webmagic/cache/")).
processor(pageProcessor2).run();

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* User: cairne

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Page;
import us.codecraft.spider.Site;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
/**
* User: cairne

@ -1,8 +1,8 @@
package us.codecraft.spider.samples;
package us.codecraft.webmagic.samples;
import us.codecraft.spider.Site;
import us.codecraft.spider.Page;
import us.codecraft.spider.processor.PageProcessor;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
@ -6,7 +6,6 @@ import org.htmlcleaner.TagNode;
import org.junit.Test;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
/**

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import junit.framework.Assert;
import org.junit.Test;

@ -1,12 +1,8 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.junit.Test;
import java.io.IOException;
import java.net.URL;
/**
* User: cairne

@ -1,4 +1,4 @@
package us.codecraft.spider.selector;
package us.codecraft.webmagic.selector;
import org.junit.Assert;
import org.junit.Test;

@ -1,4 +1,4 @@
package us.codecraft.spider.utils;
package us.codecraft.webmagic.utils;
import org.junit.Assert;
import org.junit.Test;

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration: a single console appender, root logger at info. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">

    <!-- Console appender. Each line carries the timestamp, the level padded
         to five characters, the logger name, the source file and line, and
         the message. -->
    <appender name="stdout" class="org.apache.log4j.ConsoleAppender">
        <layout class="org.apache.log4j.PatternLayout">
            <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
        </layout>
    </appender>

    <!-- Library loggers capped at warn. additivity="false" keeps their events
         from also reaching the root logger's appender, which would print
         each line twice. -->
    <logger name="org.springframework" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <logger name="net.sf.ehcache" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <!-- Every other logger: info and above, to the console. -->
    <root>
        <level value="info" />
        <appender-ref ref="stdout" />
    </root>

</log4j:configuration>

@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!-- log4j configuration: a single console appender, root logger at debug. -->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">

    <!-- Console appender. Each line carries the timestamp, the level padded
         to five characters, the logger name, the source file and line, and
         the message. -->
    <appender name="stdout" class="org.apache.log4j.ConsoleAppender">
        <layout class="org.apache.log4j.PatternLayout">
            <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
        </layout>
    </appender>

    <!-- Library loggers capped at warn so the debug-level root below does not
         let them flood the output. additivity="false" keeps their events
         from also reaching the root logger's appender, which would print
         each line twice. -->
    <logger name="org.springframework" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <logger name="org.apache" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <logger name="net.sf.ehcache" additivity="false">
        <level value="warn" />
        <appender-ref ref="stdout" />
    </logger>

    <!-- Every other logger: debug and above, to the console. -->
    <root>
        <level value="debug" />
        <appender-ref ref="stdout" />
    </root>

</log4j:configuration>

@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the webmagic-plugin module, which builds on
     webmagic-core. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-plugin</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <!-- Compile sources as UTF-8 instead of the platform default charset,
             so the build gives the same result on every machine. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- The core module this plugin module extends. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <!-- Unit tests only; not packaged with the artifact. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.7</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Copy and filter resources as UTF-8. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <configuration>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Attach a sources jar to the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Attach a javadoc jar to the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-javadoc-plugin</artifactId>
                <executions>
                    <execution>
                        <id>attach-javadocs</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-release-plugin</artifactId>
                <version>2.0-beta-7</version>
            </plugin>
        </plugins>
    </build>
</project>

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save