Clean project structure #70

pull/79/head
yihua.huang 11 years ago
parent 9606a173cd
commit 6c11718566

@ -6,7 +6,7 @@
<version>7</version>
</parent>
<groupId>us.codecraft</groupId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
<modelVersion>4.0.0</modelVersion>
<packaging>pom</packaging>
<properties>
@ -51,11 +51,11 @@
<module>webmagic-core</module>
<module>webmagic-extension/</module>
<module>webmagic-scripts/</module>
<module>webmagic-avalon</module>
<module>webmagic-lucene</module>
<module>webmagic-samples</module>
<module>webmagic-saxon</module>
<module>webmagic-selenium</module>
<module>webmagic-saxon</module>
<module>webmagic-samples</module>
<module>webmagic-admin</module>
<module>webmagic-worker</module>
</modules>
<dependencyManagement>
@ -63,7 +63,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<version>4.11</version>
<scope>test</scope>
</dependency>
<dependency>
@ -91,11 +91,6 @@
<artifactId>xsoup</artifactId>
<version>0.2.0</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
@ -121,11 +116,6 @@
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>

@ -0,0 +1,3 @@
WebMagic-Admin
=====
Admin is the web console used to control the workers.

@ -3,12 +3,12 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-avalon</artifactId>
<artifactId>webmagic-admin</artifactId>
<packaging>war</packaging>
<dependencies>

Before

Width:  |  Height:  |  Size: 8.5 KiB

After

Width:  |  Height:  |  Size: 8.5 KiB

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -50,11 +50,6 @@
<artifactId>commons-collections</artifactId>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>

@ -13,7 +13,6 @@ import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import us.codecraft.webmagic.utils.ThreadUtils;
import us.codecraft.webmagic.utils.UrlUtils;
@ -541,15 +540,6 @@ public class Spider implements Runnable, Task {
return this;
}
/**
* Switches off Xsoup by delegating to {@code EnvironmentUtil.setUseXsoup(false)}.
* <p>
* This is a static, process-wide switch rather than a per-spider setting, so it
* is not part of the fluent configuration chain and returns nothing.
*/
public static void xsoupOff() {
EnvironmentUtil.setUseXsoup(false);
}
public boolean isExitWhenComplete() {
return exitWhenComplete;
}

@ -4,7 +4,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.utils.EnvironmentUtil;
import java.util.ArrayList;
import java.util.List;
@ -96,16 +95,11 @@ public class Html extends PlainText {
@Override
public Selectable xpath(String xpath) {
if (EnvironmentUtil.useXsoup()) {
XsoupSelector xsoupSelector = new XsoupSelector(xpath);
if (document != null) {
return new Html(xsoupSelector.selectList(document));
}
return selectList(xsoupSelector, strings);
} else {
XpathSelector xpathSelector = new XpathSelector(xpath);
return selectList(xpathSelector, strings);
XpathSelector xpathSelector = new XpathSelector(xpath);
if (document != null) {
return new Html(xpathSelector.selectList(document));
}
return selectList(xpathSelector, strings);
}
@Override

@ -32,8 +32,12 @@ public abstract class Selectors {
return new XpathSelector(expr);
}
public static XsoupSelector xsoup(String expr) {
return new XsoupSelector(expr);
/**
* @deprecated use {@link #xpath(String)} instead
* @see #xpath(String)
*/
public static XpathSelector xsoup(String expr) {
return new XpathSelector(expr);
}
public static AndSelector and(Selector... selectors) {

@ -1,70 +1,32 @@
package us.codecraft.webmagic.selector;
import org.htmlcleaner.*;
import org.jsoup.nodes.Element;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
import java.util.ArrayList;
import java.util.List;
/**
* XPath selector based on HtmlCleaner.<br>
* XPath selector based on Xsoup.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @since 0.3.0
*/
public class XpathSelector implements Selector {
public class XpathSelector extends BaseElementSelector {
private String xpathStr;
private XPathEvaluator xPathEvaluator;
public XpathSelector(String xpathStr) {
this.xpathStr = xpathStr;
this.xPathEvaluator = Xsoup.compile(xpathStr);
}
@Override
public String select(String text) {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
if (tagNode == null) {
return null;
}
try {
Object[] objects = tagNode.evaluateXPath(xpathStr);
if (objects != null && objects.length >= 1) {
if (objects[0] instanceof TagNode) {
TagNode tagNode1 = (TagNode) objects[0];
return htmlCleaner.getInnerHtml(tagNode1);
} else {
return objects[0].toString();
}
}
} catch (XPatherException e) {
e.printStackTrace();
}
return null;
public String select(Element element) {
return xPathEvaluator.evaluate(element).get();
}
@Override
public List<String> selectList(String text) {
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode tagNode = htmlCleaner.clean(text);
if (tagNode == null) {
return null;
}
List<String> results = new ArrayList<String>();
try {
Object[] objects = tagNode.evaluateXPath(xpathStr);
if (objects != null && objects.length >= 1) {
for (Object object : objects) {
if (object instanceof TagNode) {
TagNode tagNode1 = (TagNode) object;
results.add(htmlCleaner.getInnerHtml(tagNode1));
} else {
results.add(object.toString());
}
}
}
} catch (XPatherException e) {
e.printStackTrace();
}
return results;
public List<String> selectList(Element element) {
return xPathEvaluator.evaluate(element).list();
}
}

@ -1,32 +0,0 @@
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Element;
import us.codecraft.xsoup.XPathEvaluator;
import us.codecraft.xsoup.Xsoup;
import java.util.List;
/**
 * Element selector that evaluates an XPath expression through Xsoup.<br>
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.3.0
 */
public class XsoupSelector extends BaseElementSelector {

    /** Evaluator compiled once, at construction time, from the XPath expression. */
    private XPathEvaluator evaluator;

    /**
     * Compiles the given XPath expression with {@code Xsoup.compile}.
     *
     * @param xpathStr XPath expression to compile
     */
    public XsoupSelector(String xpathStr) {
        evaluator = Xsoup.compile(xpathStr);
    }

    /**
     * Evaluates the compiled expression against the element.
     *
     * @param element element to evaluate against
     * @return the single-value result of the evaluation ({@code get()})
     */
    @Override
    public String select(Element element) {
        return evaluator.evaluate(element).get();
    }

    /**
     * Evaluates the compiled expression against the element.
     *
     * @param element element to evaluate against
     * @return the list result of the evaluation ({@code list()})
     */
    @Override
    public List<String> selectList(Element element) {
        return evaluator.evaluate(element).list();
    }
}

@ -1,28 +0,0 @@
package us.codecraft.webmagic.utils;
import org.apache.commons.lang3.BooleanUtils;
import java.util.Properties;
/**
* @author code4crafter@gmail.com
* @since 0.3.0
*/
public abstract class EnvironmentUtil {
private static final String USE_XSOUP = "xsoup";
public static boolean useXsoup() {
Properties properties = System.getProperties();
Object o = properties.get(USE_XSOUP);
if (o == null) {
return true;
}
return BooleanUtils.toBoolean(((String) o).toLowerCase());
}
public static void setUseXsoup(boolean useXsoup) {
Properties properties = System.getProperties();
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
}
}

@ -29,6 +29,6 @@ public class ExtractorsTest {
Assert.assertEquals("bb", and($("title"), regex("aa(bb)cc")).select(html2));
OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title"));
Assert.assertEquals("aabbcc", or.select(html));
Assert.assertEquals("aabbcc", or.select(html2));
Assert.assertEquals("<title>aabbcc</title>", or.select(html2));
}
}

@ -1,18 +0,0 @@
package us.codecraft.webmagic.utils;
import org.junit.Test;
import static junit.framework.Assert.*;
/**
 * Checks the default value of the {@code xsoup} switch and that it can be
 * turned off.
 *
 * @author code4crafter@gmail.com
 */
public class EnvironmentUtilTest {

    /** Property key used by EnvironmentUtil (its own constant is private). */
    private static final String XSOUP_KEY = "xsoup";

    @Test
    public void test() {
        // The switch lives in a JVM-wide system property: start from a known
        // "unset" state and put the previous value back afterwards so other
        // tests running in the same JVM are not affected.
        String previous = System.getProperty(XSOUP_KEY);
        System.clearProperty(XSOUP_KEY);
        try {
            assertTrue(EnvironmentUtil.useXsoup());
            EnvironmentUtil.setUseXsoup(false);
            assertFalse(EnvironmentUtil.useXsoup());
        } finally {
            if (previous == null) {
                System.clearProperty(XSOUP_KEY);
            } else {
                System.setProperty(XSOUP_KEY, previous);
            }
        }
    }
}

@ -3,7 +3,7 @@
<parent>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-parent</artifactId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -37,12 +37,7 @@ public class ExtractorUtils {
}
private static Selector getXpathSelector(String value) {
Selector selector;
if (EnvironmentUtil.useXsoup()) {
selector = new XsoupSelector(value);
} else {
selector = new XpathSelector(value);
}
Selector selector = new XpathSelector(value);
return selector;
}

@ -1,3 +0,0 @@
webmagic-lucene
--------
尝试将webmagic与lucene结合打造一个搜索引擎。开发中不作为webmagic主要模块。

@ -1,46 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor of the webmagic-lucene module, a child of webmagic-parent. -->
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagic-lucene</artifactId>
<dependencies>
<!-- Lucene 4.4.0 analyzers and query parser; versions are pinned in this module. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.4.0</version>
</dependency>
<!-- Sibling module, always taken at the same version as this project. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>${project.version}</version>
</dependency>
<!-- No version/scope here: presumably inherited from the parent's dependencyManagement (verify). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<!-- skip=true: this module's artifact is not deployed to the remote repository. -->
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -1,92 +0,0 @@
package us.codecraft.webmagic.pipeline;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Pipeline that writes every processed {@link ResultItems} as one document into
 * an in-memory Lucene index ({@link RAMDirectory}) and offers a simple search
 * over that index.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-5 <br>
 * Time: 2:11 <br>
 */
public class LucenePipeline implements Pipeline {

    /** Maximum number of hits returned by {@link #search(String, String)}. */
    private static final int MAX_HITS = 1000;

    private Directory directory;

    private Analyzer analyzer;

    private void init() throws IOException {
        analyzer = new StandardAnalyzer(Version.LUCENE_44);
        directory = new RAMDirectory();
    }

    public LucenePipeline() {
        try {
            init();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Searches the index built so far.
     *
     * @param fieldName default field the query is parsed against
     * @param value     query string in the classic query parser syntax
     * @return matching documents, at most {@value #MAX_HITS}
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code value} is not a valid query
     */
    public List<Document> search(String fieldName, String value) throws IOException, ParseException {
        List<Document> documents = new ArrayList<Document>();
        DirectoryReader ireader = DirectoryReader.open(directory);
        try {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            // Parse a simple query that searches for "text":
            QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
            Query query = parser.parse(value);
            ScoreDoc[] hits = isearcher.search(query, null, MAX_HITS).scoreDocs;
            // Iterate through the results:
            for (ScoreDoc hit : hits) {
                documents.add(isearcher.doc(hit.doc));
            }
        } finally {
            // Release the reader even when parsing or searching fails.
            ireader.close();
        }
        return documents;
    }

    /**
     * Adds all fields of the given result as one stored text document.
     * Skipped results and results without fields are ignored; fields whose
     * value is {@code null} are left out of the document.
     *
     * @param resultItems extracted fields of one page
     * @param task        the task the result belongs to (not used here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.isSkip()) {
            return;
        }
        Map<String, Object> all = resultItems.getAll();
        if (all == null) {
            return;
        }
        Document doc = new Document();
        for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
            Object fieldValue = objectEntry.getValue();
            if (fieldValue == null) {
                // Nothing to index for this key; calling toString() would throw.
                continue;
            }
            doc.add(new Field(objectEntry.getKey(), fieldValue.toString(), TextField.TYPE_STORED));
        }
        try {
            // An IndexWriterConfig instance must not be handed to more than one
            // IndexWriter, so a fresh one is created for every writer.
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_44, analyzer));
            try {
                indexWriter.addDocument(doc);
            } finally {
                // Always close so the directory's write lock is released.
                indexWriter.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

@ -1,61 +0,0 @@
package us.codecraft.webmagic.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.LucenePipeline;
import java.io.IOException;
import java.util.List;
/**
 * Sample model for blog pages matching the {@link TargetUrl} pattern below,
 * together with a demo {@code main} that crawls into a {@link LucenePipeline}
 * and periodically queries the resulting index.
 *
 * @author code4crafter@gmail.com <br>
 * Date: 13-8-2 <br>
 * Time: 7:52 <br>
 */
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
public class OschinaBlog {

    /** Pause between two searches, in milliseconds. */
    private static final long POLL_INTERVAL_MILLIS = 3000;

    // Presumably filled in by OOSpider from the annotation below (verify).
    @ExtractBy("//title")
    private String title;

    // Presumably filled in by OOSpider from the annotation below (verify).
    @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
    private String content;

    @Override
    public String toString() {
        return "OschinaBlog{" +
                "title='" + title + '\'' +
                ", content='" + content + '\'' +
                '}';
    }

    /**
     * Starts the spider asynchronously and then, every
     * {@value #POLL_INTERVAL_MILLIS} ms, prints the documents whose title
     * matches "webmagic". Polling stops when the main thread is interrupted.
     */
    public static void main(String[] args) {
        LucenePipeline pipeline = new LucenePipeline();
        OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
        while (true) {
            try {
                List<Document> search = pipeline.search("title", "webmagic");
                System.out.println(search);
            } catch (IOException e) {
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            }
            // Sleep outside the search try-block: a failing search must not
            // skip the pause, otherwise the loop would spin without delay.
            try {
                Thread.sleep(POLL_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt status and stop polling.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    public String getTitle() {
        return title;
    }

    public String getContent() {
        return content;
    }
}

@ -1,20 +0,0 @@
Worker:
任务执行者提供Http接口监控运行状态终止和开始job
队列:
仍然使用redis
Panel:
提供Web管理后台管理
1. 新建任务
1. 通过脚本
2. 配置
3. 分配机器
2. 已有任务
3. 任务查看

@ -1,35 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor of the webmagic-panel module, a child of webmagic-parent. -->
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<!-- NOTE(review): the parent version must match the parent POM being built against; confirm it is current. -->
<version>0.4.3-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-panel</artifactId>
<dependencies>
<!-- Sibling module, always taken at the same version as this project. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-scripts</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- skip=true: this module's artifact is not deployed to the remote repository. -->
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>webmagic-parent</artifactId>
<groupId>us.codecraft</groupId>
<version>0.4.4-SNAPSHOT</version>
<version>0.5.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -15,9 +15,15 @@
<artifactId>webmagic-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-1</version>
</dependency>
<dependency>
<groupId>junit</groupId>

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save