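Update Lucene pipeline: move the ResultItems.isSkip() check out of ConsolePipeline, FilePipeline, ModelPipeline and FreemarkerPipeline into Spider, and rework LucenePipeline to index result items into an in-memory RAMDirectory and expose search(fieldName, value)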

pull/17/head
yihua.huang 12 years ago
parent 29f8cd2ec6
commit 7d277e84d4

@ -228,8 +228,10 @@ public class Spider implements Runnable, Task {
} }
pageProcessor.process(page); pageProcessor.process(page);
addRequest(page); addRequest(page);
for (Pipeline pipeline : pipelines) { if (!page.getResultItems().isSkip()){
pipeline.process(page.getResultItems(), this); for (Pipeline pipeline : pipelines) {
pipeline.process(page.getResultItems(), this);
}
} }
sleep(site.getSleepTime()); sleep(site.getSleepTime());
} }

@ -15,9 +15,6 @@ public class ConsolePipeline implements Pipeline{
@Override @Override
public void process(ResultItems resultItems,Task task) { public void process(ResultItems resultItems,Task task) {
if (resultItems.isSkip()){
return;
}
System.out.println("get page: "+resultItems.getRequest().getUrl()); System.out.println("get page: "+resultItems.getRequest().getUrl());
for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
if (entry.getValue() instanceof Iterable) { if (entry.getValue() instanceof Iterable) {

@ -47,9 +47,6 @@ public class FilePipeline implements Pipeline {
if (!file.exists()) { if (!file.exists()) {
file.mkdirs(); file.mkdirs();
} }
if (resultItems.isSkip()) {
return;
}
try { try {
PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html"));
printWriter.println("url:\t" + resultItems.getRequest().getUrl()); printWriter.println("url:\t" + resultItems.getRequest().getUrl());

@ -3,20 +3,26 @@ package us.codecraft.webmagic.pipeline;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task; import us.codecraft.webmagic.Task;
import java.io.File; import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/** /**
* @author yihua.huang@dianping.com <br> * @author yihua.huang@dianping.com <br>
@ -24,41 +30,64 @@ import java.io.File;
* Time: 2:11 <br> * Time: 2:11 <br>
*/ */
public class LucenePipeline implements Pipeline { public class LucenePipeline implements Pipeline {
@Override
public void process(ResultItems resultItems, Task task) {
try {
} catch (Exception e) { private Directory directory;
} private IndexWriter indexWriter;
}
private Analyzer analyzer;
public static void main(String[] args) throws Exception { private void init() throws IOException {
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); analyzer = new StandardAnalyzer(Version.LUCENE_44);
// Directory directory = new RAMDirectory(); directory = new RAMDirectory();
// To store an index on disk, use this instead:
Directory directory = FSDirectory.open(new File("/data/webmagic/www.guoxue123.cn/"));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
IndexWriter iwriter = new IndexWriter(directory, config); indexWriter = new IndexWriter(directory, config);
Document doc = new Document(); indexWriter.close();
// String text = "This is the text to be indexed."; }
// doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
// iwriter.addDocument(doc); public LucenePipeline() {
iwriter.close(); try {
init();
} catch (IOException e) {
e.printStackTrace();
}
}
// Now search the index: public List<Document> search(String fieldName, String value) throws IOException, ParseException {
List<Document> documents = new ArrayList<Document>();
DirectoryReader ireader = DirectoryReader.open(directory); DirectoryReader ireader = DirectoryReader.open(directory);
IndexSearcher isearcher = new IndexSearcher(ireader); IndexSearcher isearcher = new IndexSearcher(ireader);
// Parse a simple query that searches for "text": // Parse a simple query that searches for "text":
QueryParser parser = new QueryParser(Version.LUCENE_44, "fieldname", analyzer); QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer);
Query query = parser.parse("经典"); Query query = parser.parse(value);
ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
// Iterate through the results: // Iterate through the results:
for (int i = 0; i < hits.length; i++) { for (int i = 0; i < hits.length; i++) {
Document hitDoc = isearcher.doc(hits[i].doc); Document hitDoc = isearcher.doc(hits[i].doc);
System.out.println(hitDoc); documents.add(hitDoc);
} }
ireader.close(); ireader.close();
directory.close(); directory.close();
return documents;
}
@Override
public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()){
return;
}
Document doc = new Document();
Map<String,Object> all = resultItems.getAll();
if (all==null){
return;
}
for (Map.Entry<String, Object> objectEntry : all.entrySet()) {
doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED));
}
try {
indexWriter.addDocument(doc);
} catch (IOException e) {
e.printStackTrace();
}
} }
} }

@ -29,9 +29,6 @@ class ModelPipeline implements Pipeline {
@Override @Override
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) { for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
if (o != null) { if (o != null) {

@ -40,9 +40,6 @@ public class FreemarkerPipeline implements Pipeline {
@Override @Override
public void process(ResultItems resultItems, Task task) { public void process(ResultItems resultItems, Task task) {
if (resultItems.isSkip()) {
return;
}
String path = this.path + "" + task.getUUID() + "/"; String path = this.path + "" + task.getUUID() + "/";
File file = new File(path); File file = new File(path);
if (!file.exists()) { if (!file.exists()) {

Loading…
Cancel
Save