From d56c681be1945379e117b8c61be8baec3cb4fcf6 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 5 Aug 2013 18:08:28 +0800 Subject: [PATCH] add priority to request --- .../java/us/codecraft/webmagic/Request.java | 27 ++++++++ webmagic-plugin/pom.xml | 1 + webmagic-plugin/webmagic-lucene/pom.xml | 28 ++++++++ .../webmagic/pipeline/LucenePipeline.java | 64 +++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 webmagic-plugin/webmagic-lucene/pom.xml create mode 100644 webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java index 42dd079f..1f6657ce 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Request.java @@ -28,6 +28,8 @@ public class Request { private Object[] extra; + private double priority; + /** * 构建一个request对象 * @param url 必须参数,待抓取的url @@ -38,6 +40,15 @@ public class Request { this.extra = extra; } + public double getPriority() { + return priority; + } + + public Request setPriority(double priority) { + this.priority = priority; + return this; + } + /** * 获取预存的对象 * @return object[] 预存的对象数组 @@ -54,4 +65,20 @@ public class Request { return url; } + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Request request = (Request) o; + + if (!url.equals(request.url)) return false; + + return true; + } + + @Override + public int hashCode() { + return url.hashCode(); + } } diff --git a/webmagic-plugin/pom.xml b/webmagic-plugin/pom.xml index 22257222..54c69ec5 100644 --- a/webmagic-plugin/pom.xml +++ b/webmagic-plugin/pom.xml @@ -12,6 +12,7 @@ webmagic-misc webmagic-selenium + webmagic-lucene webmagic-plugin diff --git a/webmagic-plugin/webmagic-lucene/pom.xml 
b/webmagic-plugin/webmagic-lucene/pom.xml new file mode 100644 index 00000000..b072472d --- /dev/null +++ b/webmagic-plugin/webmagic-lucene/pom.xml @@ -0,0 +1,28 @@ + + + + webmagic-plugin + us.codecraft + 0.1.0 + + 4.0.0 + + webmagic-lucene + + + + org.apache.lucene + lucene-analyzers-common + 4.4.0 + + + org.apache.lucene + lucene-queryparser + 4.4.0 + + + + + \ No newline at end of file diff --git a/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java new file mode 100644 index 00000000..2e7191c5 --- /dev/null +++ b/webmagic-plugin/webmagic-lucene/src/main/java/us/codecraft/webmagic/pipeline/LucenePipeline.java @@ -0,0 +1,64 @@ +package us.codecraft.webmagic.pipeline; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import us.codecraft.webmagic.ResultItems; +import us.codecraft.webmagic.Task; + +import java.io.File; + +/** + * @author yihua.huang@dianping.com
+ * @date: 2013-08-05
+ * Time: 2:11 PM
+ */
+public class LucenePipeline implements Pipeline {
+
+    /** Filesystem location of the Lucene index opened by {@link #main(String[])}. */
+    private static final String INDEX_PATH = "/data/webmagic/www.guoxue123.cn/";
+
+    /** Default field handed to the query parser in {@link #main(String[])}. */
+    private static final String DEFAULT_FIELD = "fieldname";
+
+    /** Query text parsed and searched by {@link #main(String[])}. */
+    private static final String QUERY_TEXT = "经典";
+
+    /** Maximum number of hits requested from the searcher. */
+    private static final int MAX_HITS = 1000;
+
+    /**
+     * Placeholder implementation: nothing is indexed yet.
+     *
+     * @param resultItems currently unused
+     * @param task        currently unused
+     */
+    @Override
+    public void process(ResultItems resultItems, Task task) {
+        // Intentionally a no-op. This used to be an empty try block guarded by an
+        // empty catch (Exception); that catch was dead code and would have silently
+        // swallowed failures as soon as real indexing logic was added.
+        // TODO: write resultItems to the Lucene index.
+    }
+
+    /**
+     * Manual smoke test: opens the on-disk index at {@link #INDEX_PATH}, runs
+     * {@link #QUERY_TEXT} against {@link #DEFAULT_FIELD} and prints every hit
+     * document to standard output.
+     *
+     * @param args ignored
+     * @throws Exception if the index cannot be opened, the query cannot be parsed,
+     *                   or the search fails
+     */
+    public static void main(String[] args) throws Exception {
+        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
+        Directory directory = FSDirectory.open(new File(INDEX_PATH));
+        try {
+            // A writer is opened and closed without adding any document.
+            // NOTE(review): presumably this guarantees an index exists before the
+            // reader is opened -- confirm.
+            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);
+            IndexWriter iwriter = new IndexWriter(directory, config);
+            iwriter.close();
+
+            DirectoryReader ireader = DirectoryReader.open(directory);
+            try {
+                IndexSearcher isearcher = new IndexSearcher(ireader);
+                QueryParser parser = new QueryParser(Version.LUCENE_44, DEFAULT_FIELD, analyzer);
+                Query query = parser.parse(QUERY_TEXT);
+                ScoreDoc[] hits = isearcher.search(query, null, MAX_HITS).scoreDocs;
+                for (ScoreDoc hit : hits) {
+                    Document hitDoc = isearcher.doc(hit.doc);
+                    System.out.println(hitDoc);
+                }
+            } finally {
+                // Release the reader even when parsing or searching throws.
+                ireader.close();
+            }
+        } finally {
+            // Release the directory even when the writer or reader fails to open.
+            directory.close();
+        }
+    }
+}