From 7d277e84d46e3f4ac43841aa3e080cf14b6db4fb Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Tue, 6 Aug 2013 21:47:44 +0800 Subject: [PATCH] update lucene pipeline --- .../java/us/codecraft/webmagic/Spider.java | 6 +- .../webmagic/pipeline/ConsolePipeline.java | 3 - .../webmagic/pipeline/FilePipeline.java | 3 - .../webmagic/pipeline/LucenePipeline.java | 75 +++++++++++++------ .../webmagic/model/ModelPipeline.java | 3 - .../webmagic/pipeline/FreemarkerPipeline.java | 3 - 6 files changed, 56 insertions(+), 37 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 414315c2..878c63e3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -228,8 +228,10 @@ public class Spider implements Runnable, Task { } pageProcessor.process(page); addRequest(page); - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); + if (!page.getResultItems().isSkip()){ + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } } sleep(site.getSleepTime()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 8f294745..2ff99c87 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -15,9 +15,6 @@ public class ConsolePipeline implements Pipeline{ @Override public void process(ResultItems resultItems,Task task) { - if (resultItems.isSkip()){ - return; - } System.out.println("get page: "+resultItems.getRequest().getUrl()); for (Map.Entry entry : resultItems.getAll().entrySet()) { if (entry.getValue() instanceof Iterable) { diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 01f8d8b8..39248d24 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -47,9 +47,6 @@ public class FilePipeline implements Pipeline { if (!file.exists()) { file.mkdirs(); } - if (resultItems.isSkip()) { - return; - } try { PrintWriter printWriter = new PrintWriter(new FileWriter(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")); printWriter.println("url:\t" + resultItems.getRequest().getUrl()); diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java index 2e7191c5..aca6501f 100644 --- a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java +++ b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -3,20 +3,26 @@ package us.codecraft.webmagic.pipeline; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; -import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; /** * @author yihua.huang@dianping.com
@@ -24,41 +30,64 @@ import java.io.File; * Time: 下午2:11
*/ public class LucenePipeline implements Pipeline { - @Override - public void process(ResultItems resultItems, Task task) { - try { - } catch (Exception e) { + private Directory directory; - } - } + private IndexWriter indexWriter; + + private Analyzer analyzer; - public static void main(String[] args) throws Exception { - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); -// Directory directory = new RAMDirectory(); - // To store an index on disk, use this instead: - Directory directory = FSDirectory.open(new File("/data/webmagic/www.guoxue123.cn/")); + private void init() throws IOException { + analyzer = new StandardAnalyzer(Version.LUCENE_44); + directory = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer); - IndexWriter iwriter = new IndexWriter(directory, config); - Document doc = new Document(); -// String text = "This is the text to be indexed."; -// doc.add(new Field("fieldname", text, TextField.TYPE_STORED)); -// iwriter.addDocument(doc); - iwriter.close(); + indexWriter = new IndexWriter(directory, config); + indexWriter.close(); + } + + public LucenePipeline() { + try { + init(); + } catch (IOException e) { + e.printStackTrace(); + } + } - // Now search the index: + public List search(String fieldName, String value) throws IOException, ParseException { + List documents = new ArrayList(); DirectoryReader ireader = DirectoryReader.open(directory); IndexSearcher isearcher = new IndexSearcher(ireader); // Parse a simple query that searches for "text": - QueryParser parser = new QueryParser(Version.LUCENE_44, "fieldname", analyzer); - Query query = parser.parse("经典"); + QueryParser parser = new QueryParser(Version.LUCENE_44, fieldName, analyzer); + Query query = parser.parse(value); ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); - System.out.println(hitDoc); + documents.add(hitDoc); } ireader.close(); directory.close(); + return documents; + } + + @Override + public void process(ResultItems resultItems, Task task) { + if (resultItems.isSkip()){ + return; + } + Document doc = new Document(); + Map all = resultItems.getAll(); + if (all==null){ + return; + } + for (Map.Entry objectEntry : all.entrySet()) { + doc.add(new Field(objectEntry.getKey(), objectEntry.getValue().toString(), TextField.TYPE_STORED)); + } + try { + indexWriter.addDocument(doc); + } catch (IOException e) { + e.printStackTrace(); + } } } diff --git a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java index f9b0015a..c9f67dce 100644 --- a/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java +++ b/webmagic-plugin/webmagic-misc/src/main/java/us/codecraft/webmagic/model/ModelPipeline.java @@ -29,9 +29,6 @@ class ModelPipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()) { - return; - } for (Map.Entry classPageModelPipelineEntry : pageModelPipelines.entrySet()) { Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName()); if (o != null) { diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java index 9a045eff..37420628 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/pipeline/FreemarkerPipeline.java @@ -40,9 +40,6 @@ public class FreemarkerPipeline implements Pipeline { @Override public void process(ResultItems resultItems, Task task) { - if (resultItems.isSkip()) { - return; - } String path = this.path + "" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) {