Clean project structure #70
parent
9606a173cd
commit
6c11718566
@ -0,0 +1,3 @@
|
||||
WebMagic-Admin
|
||||
=====
|
||||
Admin is the web control panel for the workers.
|
Before Width: | Height: | Size: 8.5 KiB After Width: | Height: | Size: 8.5 KiB |
@ -1,70 +1,32 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.htmlcleaner.*;
|
||||
import org.jsoup.nodes.Element;
|
||||
import us.codecraft.xsoup.XPathEvaluator;
|
||||
import us.codecraft.xsoup.Xsoup;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XPath selector based on HtmlCleaner.<br>
|
||||
* XPath selector based on Xsoup.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.1.0
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public class XpathSelector implements Selector {
|
||||
public class XpathSelector extends BaseElementSelector {
|
||||
|
||||
private String xpathStr;
|
||||
private XPathEvaluator xPathEvaluator;
|
||||
|
||||
public XpathSelector(String xpathStr) {
|
||||
this.xpathStr = xpathStr;
|
||||
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(String text) {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
if (tagNode == null) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
Object[] objects = tagNode.evaluateXPath(xpathStr);
|
||||
if (objects != null && objects.length >= 1) {
|
||||
if (objects[0] instanceof TagNode) {
|
||||
TagNode tagNode1 = (TagNode) objects[0];
|
||||
return htmlCleaner.getInnerHtml(tagNode1);
|
||||
} else {
|
||||
return objects[0].toString();
|
||||
}
|
||||
}
|
||||
} catch (XPatherException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
public String select(Element element) {
|
||||
return xPathEvaluator.evaluate(element).get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(String text) {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
TagNode tagNode = htmlCleaner.clean(text);
|
||||
if (tagNode == null) {
|
||||
return null;
|
||||
}
|
||||
List<String> results = new ArrayList<String>();
|
||||
try {
|
||||
Object[] objects = tagNode.evaluateXPath(xpathStr);
|
||||
if (objects != null && objects.length >= 1) {
|
||||
for (Object object : objects) {
|
||||
if (object instanceof TagNode) {
|
||||
TagNode tagNode1 = (TagNode) object;
|
||||
results.add(htmlCleaner.getInnerHtml(tagNode1));
|
||||
} else {
|
||||
results.add(object.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (XPatherException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return results;
|
||||
public List<String> selectList(Element element) {
|
||||
return xPathEvaluator.evaluate(element).list();
|
||||
}
|
||||
}
|
||||
|
@ -1,32 +0,0 @@
|
||||
package us.codecraft.webmagic.selector;
|
||||
|
||||
import org.jsoup.nodes.Element;
|
||||
import us.codecraft.xsoup.XPathEvaluator;
|
||||
import us.codecraft.xsoup.Xsoup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* XPath selector based on Xsoup.<br>
|
||||
*
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public class XsoupSelector extends BaseElementSelector {
|
||||
|
||||
private XPathEvaluator xPathEvaluator;
|
||||
|
||||
public XsoupSelector(String xpathStr) {
|
||||
this.xPathEvaluator = Xsoup.compile(xpathStr);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String select(Element element) {
|
||||
return xPathEvaluator.evaluate(element).get();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> selectList(Element element) {
|
||||
return xPathEvaluator.evaluate(element).list();
|
||||
}
|
||||
}
|
@ -1,28 +0,0 @@
|
||||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import org.apache.commons.lang3.BooleanUtils;
|
||||
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public abstract class EnvironmentUtil {
|
||||
|
||||
private static final String USE_XSOUP = "xsoup";
|
||||
|
||||
public static boolean useXsoup() {
|
||||
Properties properties = System.getProperties();
|
||||
Object o = properties.get(USE_XSOUP);
|
||||
if (o == null) {
|
||||
return true;
|
||||
}
|
||||
return BooleanUtils.toBoolean(((String) o).toLowerCase());
|
||||
}
|
||||
|
||||
public static void setUseXsoup(boolean useXsoup) {
|
||||
Properties properties = System.getProperties();
|
||||
properties.setProperty(USE_XSOUP, BooleanUtils.toString(useXsoup, "true", "false"));
|
||||
}
|
||||
}
|
@ -1,18 +0,0 @@
|
||||
package us.codecraft.webmagic.utils;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static junit.framework.Assert.*;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com
|
||||
*/
|
||||
public class EnvironmentUtilTest {
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
assertTrue(EnvironmentUtil.useXsoup());
|
||||
EnvironmentUtil.setUseXsoup(false);
|
||||
assertFalse(EnvironmentUtil.useXsoup());
|
||||
}
|
||||
}
|
@ -1,46 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>webmagic-lucene</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-analyzers-common</artifactId>
|
||||
<version>4.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-queryparser</artifactId>
|
||||
<version>4.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<configuration>
|
||||
<skip>true</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
</project>
|
@ -1,92 +0,0 @@
|
||||
package us.codecraft.webmagic.pipeline;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.queryparser.classic.QueryParser;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.Version;
|
||||
import us.codecraft.webmagic.ResultItems;
|
||||
import us.codecraft.webmagic.Task;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-5 <br>
|
||||
* Time: 下午2:11 <br>
|
||||
*/
|
||||
public class LucenePipeline implements Pipeline {
|
||||
|
||||
private Directory directory;
|
||||
|
||||
private Analyzer analyzer;
|
||||
|
||||
private IndexWriterConfig config;
|
||||
|
||||
private void init() throws IOException {
|
||||
analyzer = new StandardAnalyzer(Version.LUCENE_44);
|
||||
directory = new RAMDirectory();
|
||||
config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
|
||||
}
|
||||
|
||||
public LucenePipeline() {
|
||||
try {
|
||||
init();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public List<Document> search(String fieldName, String value) throws IOException, ParseException {
|
||||
List<Document> documents = new ArrayList<Document>();
|
||||
DirectoryReader ireader = DirectoryReader.open(directory);
|
||||
IndexSearcher isearcher = new IndexSearcher(ireader);
|
||||
// Parse a simple query that searches for "text":
|
||||
QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
|
||||
Query query = parser.parse(value);
|
||||
ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
|
||||
// Iterate through the results:
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
Document hitDoc = isearcher.doc(hits[i].doc);
|
||||
documents.add(hitDoc);
|
||||
}
|
||||
ireader.close();
|
||||
return documents;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(ResultItems resultItems, Task task) {
|
||||
if (resultItems.isSkip()){
|
||||
return;
|
||||
}
|
||||
Document doc = new Document();
|
||||
Map<String,Object> all = resultItems.getAll();
|
||||
if (all==null){
|
||||
return;
|
||||
}
|
||||
for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
|
||||
doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
|
||||
}
|
||||
try {
|
||||
IndexWriter indexWriter = new IndexWriter(directory, config);
|
||||
indexWriter.addDocument(doc);
|
||||
indexWriter.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
package us.codecraft.webmagic.lucene;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.model.annotation.ExtractBy;
|
||||
import us.codecraft.webmagic.model.OOSpider;
|
||||
import us.codecraft.webmagic.model.annotation.TargetUrl;
|
||||
import us.codecraft.webmagic.pipeline.LucenePipeline;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author code4crafter@gmail.com <br>
|
||||
* Date: 13-8-2 <br>
|
||||
* Time: 上午7:52 <br>
|
||||
*/
|
||||
@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+")
|
||||
public class OschinaBlog {
|
||||
|
||||
@ExtractBy("//title")
|
||||
private String title;
|
||||
|
||||
@ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css)
|
||||
private String content;
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "OschinaBlog{" +
|
||||
"title='" + title + '\'' +
|
||||
", content='" + content + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
LucenePipeline pipeline = new LucenePipeline();
|
||||
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog"), OschinaBlog.class).pipeline(pipeline).runAsync();
|
||||
while (true) {
|
||||
try {
|
||||
List<Document> search = pipeline.search("title", "webmagic");
|
||||
System.out.println(search);
|
||||
Thread.sleep(3000);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} catch (ParseException e) {
|
||||
e.printStackTrace();
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>webmagic-parent</artifactId>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<version>0.4.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-panel</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-scripts</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-deploy-plugin</artifactId>
|
||||
<configuration>
|
||||
<skip>true</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
</project>
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue