add uuid to spider

pull/17/head
yihua.huang 12 years ago
parent 6428e20543
commit 49a4ad66d3

@ -12,11 +12,6 @@ public class Site {
private String domain;
/**
* identifies a task
*/
private String identifier;
private String userAgent;
private Map<String, String> cookies = new LinkedHashMap<String, String>();
@ -66,15 +61,6 @@ public class Site {
return this;
}
public String getIdentifier() {
return identifier;
}
public Site setIdentifier(String identifier) {
this.identifier = identifier;
return this;
}
public String getEncoding() {
return encoding;
}
@ -97,7 +83,7 @@ public class Site {
return startUrls;
}
public Site setStartUrl(String startUrl) {
public Site addStartUrl(String startUrl) {
this.startUrls.add(startUrl);
return this;
}

@ -18,7 +18,7 @@ import java.util.List;
* Date: 13-4-21
* Time: 6:53
*/
public class Spider implements Runnable {
public class Spider implements Runnable, Task {
private Downloader downloader = new HttpClientDownloader();
@ -26,6 +26,12 @@ public class Spider implements Runnable {
private PageProcessor pageProcessor;
private List<String> startUrls;
private Site site;
private String uuid;
private Schedular schedular = new QueueSchedular();
private Logger logger = Logger.getLogger(getClass());
@ -36,9 +42,18 @@ public class Spider implements Runnable {
public Spider processor(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
schedular.push(new Request(startUrl), pageProcessor.getSite());
}
this.site = pageProcessor.getSite();
return this;
}
/**
 * Sets the start URL list of this spider. The given list reference is stored as-is,
 * not copied.
 * NOTE(review): the run() method visible in this file seeds the scheduler from
 * pageProcessor.getSite().getStartUrls(); confirm where this field is consumed.
 *
 * @param startUrls the list to use as start URLs
 * @return this spider, for call chaining
 */
public Spider startUrls(List<String> startUrls) {
this.startUrls = startUrls;
return this;
}
/**
 * Replaces the start URLs of this spider with a new list that holds only the given URL.
 *
 * @param startUrl the single URL to keep as start URL
 * @return this spider, for call chaining
 */
public Spider startUrl(String startUrl) {
    List<String> single = new ArrayList<String>();
    single.add(startUrl);
    this.startUrls = single;
    return this;
}
@ -59,13 +74,15 @@ public class Spider implements Runnable {
@Override
public void run() {
Site site = pageProcessor.getSite();
Request request = schedular.poll(site);
if (pipelines.isEmpty()){
for (String startUrl : pageProcessor.getSite().getStartUrls()) {
schedular.push(new Request(startUrl), this);
}
Request request = schedular.poll(this);
if (pipelines.isEmpty()) {
pipelines.add(new ConsolePipeline());
}
while (request != null) {
Page page = downloader.download(request,site);
Page page = downloader.download(request, site);
if (page == null) {
sleep(site.getSleepTime());
continue;
@ -73,13 +90,19 @@ public class Spider implements Runnable {
pageProcessor.process(page);
addRequest(page);
for (Pipeline pipeline : pipelines) {
pipeline.process(page,site);
pipeline.process(page, this);
}
sleep(site.getSleepTime());
request = schedular.poll(site);
request = schedular.poll(this);
}
}
/**
 * Assigns an explicit identifier to this spider. When it is non-null, getUUID()
 * returns it in preference to the site's domain.
 *
 * @param uuid the identifier to assign
 * @return this spider, for call chaining
 */
public Spider setUUID(String uuid) {
this.uuid = uuid;
return this;
}
private void sleep(int time) {
try {
Thread.sleep(time);
@ -91,8 +114,19 @@ public class Spider implements Runnable {
private void addRequest(Page page) {
if (CollectionUtils.isNotEmpty(page.getTargetRequests())) {
for (Request request : page.getTargetRequests()) {
schedular.push(request,pageProcessor.getSite());
schedular.push(request, this);
}
}
}
/**
 * Returns the identifier of this spider.
 *
 * @return the explicitly assigned uuid if there is one; otherwise the domain of the
 *         site when a site is present; otherwise {@code null}
 */
@Override
public String getUUID() {
    if (uuid == null) {
        // No explicit uuid: fall back to the site's domain when a site is known.
        return site == null ? null : site.getDomain();
    }
    return uuid;
}
}

@ -0,0 +1,12 @@
package us.codecraft.webmagic;
/**
 * Something that can be identified by a UUID string; implemented by Spider.
 *
 * Author: code4crafter@gmail.com
 * Date: 13-6-18
 * Time: 2:57
 */
public interface Task {

    /**
     * Returns the identifier of this task.
     *
     * @return the identifier string as supplied by the implementation
     */
    String getUUID();
}

@ -1,7 +1,7 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Selectable;
import java.util.Map;
@ -14,7 +14,7 @@ import java.util.Map;
public class ConsolePipeline implements Pipeline{
@Override
public void process(Page page,Site site) {
public void process(Page page,Task task) {
System.out.println("get page: "+page.getUrl());
for (Map.Entry<String, Selectable> entry : page.getFields().entrySet()) {
System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings());

@ -2,9 +2,8 @@ package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.UrlUtils;
import java.io.File;
import java.io.FileWriter;
@ -30,10 +29,8 @@ public class FilePipeline implements Pipeline {
}
@Override
public void process(Page page, Site site) {
String domain = site.getDomain();
domain = UrlUtils.getDomain(domain);
String path = this.path + "" + domain + "#" + site.getIdentifier() + "/";
public void process(Page page, Task task) {
String path = this.path + "/" + task.getUUID() + "/";
File file = new File(path);
if (!file.exists()) {
file.mkdirs();

@ -1,7 +1,7 @@
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* Author: code4crafter@gmail.com
@ -10,5 +10,5 @@ import us.codecraft.webmagic.Site;
*/
public interface Pipeline {
public void process(Page page,Site site);
public void process(Page page,Task task);
}

@ -20,7 +20,7 @@ public class SimplePageProcessor implements PageProcessor {
private Site site;
public SimplePageProcessor(String startUrl, String urlPattern) {
this.site = Site.me().setStartUrl(startUrl).
this.site = Site.me().addStartUrl(startUrl).
setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA);
this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")";

@ -2,8 +2,8 @@ package us.codecraft.webmagic.schedular;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import java.io.*;
import java.util.LinkedHashSet;
@ -28,7 +28,7 @@ public class FileCacheQueueSchedular implements Schedular {
private String fileUrlAllName = ".urls.txt";
private Site site;
private Task task;
private String fileCursor = ".cursor.txt";
@ -44,13 +44,13 @@ public class FileCacheQueueSchedular implements Schedular {
private Set<String> urls;
public FileCacheQueueSchedular(Site site) {
this.site = site;
public FileCacheQueueSchedular(Task task) {
this.task = task;
}
public FileCacheQueueSchedular(Site site, String filePath) {
public FileCacheQueueSchedular(Task task, String filePath) {
this.filePath = filePath;
this.site = site;
this.task = task;
}
private void flush() {
@ -106,7 +106,7 @@ public class FileCacheQueueSchedular implements Schedular {
urls.add(line.trim());
lineReaded++;
if (lineReaded > cursor.get()) {
queue.add(new Request(line, site));
queue.add(new Request(line));
}
}
}
@ -121,11 +121,11 @@ public class FileCacheQueueSchedular implements Schedular {
}
private String getFileName(String filename) {
return filePath + site.getDomain() + "#" + site.getIdentifier() + filename;
return filePath + task.getUUID() + "/" + filename;
}
@Override
public synchronized void push(Request request, Site site) {
public synchronized void push(Request request, Task task) {
if (!inited.get()) {
init();
}
@ -140,7 +140,7 @@ public class FileCacheQueueSchedular implements Schedular {
}
@Override
public synchronized Request poll(Site site) {
public synchronized Request poll(Task task) {
if (!inited.get()) {
init();
}

@ -2,7 +2,7 @@ package us.codecraft.webmagic.schedular;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import java.util.HashSet;
import java.util.Set;
@ -23,7 +23,7 @@ public class QueueSchedular implements Schedular {
private Set<String> urls = new HashSet<String>();
@Override
public synchronized void push(Request request,Site site) {
public synchronized void push(Request request,Task task) {
if (logger.isDebugEnabled()){
logger.debug("push to queue "+request.getUrl());
}
@ -34,7 +34,7 @@ public class QueueSchedular implements Schedular {
}
@Override
public synchronized Request poll(Site site) {
public synchronized Request poll(Task task) {
return queue.poll();
}
}

@ -1,7 +1,7 @@
package us.codecraft.webmagic.schedular;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
/**
* Author: code4crafter@gmail.com
@ -10,8 +10,8 @@ import us.codecraft.webmagic.Site;
*/
public interface Schedular {
public void push(Request request,Site site);
public void push(Request request,Task task);
public Request poll(Site site);
public Request poll(Task task);
}

@ -37,7 +37,7 @@ public class DiandianBlogProcessor implements PageProcessor {
public Site getSite() {
// site defines the extraction configuration, the start URLs, etc.
if (site == null) {
site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/").
site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;

@ -27,7 +27,7 @@ public class DianpingBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/").
return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -33,7 +33,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
@Override
public Site getSite() {
if (site==null){
site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500);
}
return site;

@ -23,6 +23,6 @@ public class F58PageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates.
}
}

@ -23,7 +23,7 @@ public class HuxiuProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/").
return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -22,7 +22,7 @@ public class NjuBBSProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -24,7 +24,7 @@ public class OschinaBlogPageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/").
return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -23,7 +23,7 @@ public class OschinaPageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/").
return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -26,7 +26,7 @@ public class QzoneBlogProcessor implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/").
return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/").
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
}

@ -26,7 +26,7 @@ public class SinaBlogProcesser implements PageProcessor {
@Override
public Site getSite() {
if (site==null){
site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000).
setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
}
return site;

@ -23,6 +23,6 @@ public class TianyaPageProcesser implements PageProcessor {
@Override
public Site getSite() {
return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates.
}
}

@ -1,13 +1,13 @@
<item>
<title>${title}</title>
<link>http://127.0.0.1/wordpress/?p=${id}</link>
<link>http://127.0.0.1/wordpress/?p=${uuid}</link>
<pubDate>${date}</pubDate>
<dc:creator>admin</dc:creator>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${id}</guid>
<guid isPermaLink="false">http://127.0.0.1/wordpress/?p=${uuid}</guid>
<description></description>
<content:encoded><![CDATA[${content}]]></content:encoded>
<excerpt:encoded><![CDATA[]]></excerpt:encoded>
<wp:post_id>${id}</wp:post_id>
<wp:post_id>${uuid}</wp:post_id>
<wp:post_date>${date}</wp:post_date>
<wp:post_date_gmt>${date}</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>

Loading…
Cancel
Save