diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 674ac5bc..a4d88d89 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -12,11 +12,6 @@ public class Site { private String domain; - /** - * for identify a task - */ - private String identifier; - private String userAgent; private Map cookies = new LinkedHashMap(); @@ -66,15 +61,6 @@ public class Site { return this; } - public String getIdentifier() { - return identifier; - } - - public Site setIdentifier(String identifier) { - this.identifier = identifier; - return this; - } - public String getEncoding() { return encoding; } @@ -97,7 +83,7 @@ public class Site { return startUrls; } - public Site setStartUrl(String startUrl) { + public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 7f34850d..f7f560cb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -18,7 +18,7 @@ import java.util.List; * Date: 13-4-21 * Time: 上午6:53 */ -public class Spider implements Runnable { +public class Spider implements Runnable, Task { private Downloader downloader = new HttpClientDownloader(); @@ -26,6 +26,12 @@ public class Spider implements Runnable { private PageProcessor pageProcessor; + private List startUrls; + + private Site site; + + private String uuid; + private Schedular schedular = new QueueSchedular(); private Logger logger = Logger.getLogger(getClass()); @@ -36,9 +42,18 @@ public class Spider implements Runnable { public Spider processor(PageProcessor pageProcessor) { this.pageProcessor = pageProcessor; - for (String startUrl : pageProcessor.getSite().getStartUrls()) { - schedular.push(new Request(startUrl), pageProcessor.getSite()); - } + this.site = pageProcessor.getSite(); + return this; + } + + public Spider startUrls(List startUrls) { + this.startUrls = startUrls; + return this; + } + + public Spider startUrl(String startUrl) { + startUrls = new ArrayList(); + startUrls.add(startUrl); return this; } @@ -59,13 +74,15 @@ public class Spider implements Runnable { @Override public void run() { - Site site = pageProcessor.getSite(); - Request request = schedular.poll(site); - if (pipelines.isEmpty()){ + for (String startUrl : pageProcessor.getSite().getStartUrls()) { + schedular.push(new Request(startUrl), this); + } + Request request = schedular.poll(this); + if (pipelines.isEmpty()) { pipelines.add(new ConsolePipeline()); } while (request != null) { - Page page = downloader.download(request,site); + Page page = downloader.download(request, site); if (page == null) { sleep(site.getSleepTime()); continue; @@ -73,13 +90,19 @@ public class Spider implements Runnable { pageProcessor.process(page); addRequest(page); for (Pipeline pipeline : pipelines) { - pipeline.process(page,site); + pipeline.process(page, this); } sleep(site.getSleepTime()); - request = schedular.poll(site); + request = schedular.poll(this); } } + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } + + private void sleep(int time) { try { Thread.sleep(time); @@ -91,8 +114,19 @@ public class Spider implements Runnable { private void addRequest(Page page) { if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { for (Request request : page.getTargetRequests()) { - schedular.push(request,pageProcessor.getSite()); + schedular.push(request, this); } } } + + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + return null; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java new file mode 100644 index 00000000..0eaf6c95 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Task.java @@ -0,0 +1,12 @@ +package us.codecraft.webmagic; + +/** + * Author: code4crafer@gmail.com + * Date: 13-6-18 + * Time: 下午2:57 + */ +public interface Task { + + public String getUUID(); + +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java index 635bab62..7b93876d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Selectable; import java.util.Map; @@ -14,7 +14,7 @@ import java.util.Map; public class ConsolePipeline implements Pipeline{ @Override - public void process(Page page,Site site) { + public void process(Page page,Task task) { System.out.println("get page: "+page.getUrl()); for (Map.Entry entry : page.getFields().entrySet()) { System.out.println(entry.getKey()+":\t"+entry.getValue().toStrings()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java index 84a94ce7..2311a75d 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/FilePipeline.java @@ -2,9 +2,8 @@ package us.codecraft.webmagic.pipeline; import org.apache.commons.codec.digest.DigestUtils; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import us.codecraft.webmagic.selector.Selectable; -import us.codecraft.webmagic.utils.UrlUtils; import java.io.File; import java.io.FileWriter; @@ -30,10 +29,8 @@ public class FilePipeline implements Pipeline { } @Override - public void process(Page page, Site site) { - String domain = site.getDomain(); - domain = UrlUtils.getDomain(domain); - String path = this.path + "" + domain + "#" + site.getIdentifier() + "/"; + public void process(Page page, Task task) { + String path = this.path + "/" + task.getUUID() + "/"; File file = new File(path); if (!file.exists()) { file.mkdirs(); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java index e5da1ea8..b2b51e0a 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.pipeline; import us.codecraft.webmagic.Page; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Author: code4crafter@gmail.com @@ -10,5 +10,5 @@ import us.codecraft.webmagic.Site; */ public interface Pipeline { - public void process(Page page,Site site); + public void process(Page page,Task task); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index c5d63771..9f4eed36 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -20,7 +20,7 @@ public class SimplePageProcessor implements PageProcessor { private Site site; public SimplePageProcessor(String startUrl, String urlPattern) { - this.site = Site.me().setStartUrl(startUrl). + this.site = Site.me().addStartUrl(startUrl). setDomain(UrlUtils.getDomain(startUrl)).setUserAgent(UA); this.urlPattern = "("+urlPattern.replace(".","\\.").replace("*","[^\"'#]*")+")"; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java index b3086a28..e9d4adb7 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/FileCacheQueueSchedular.java @@ -2,8 +2,8 @@ package us.codecraft.webmagic.schedular; import org.apache.commons.lang3.math.NumberUtils; import org.apache.log4j.Logger; -import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Task; import java.io.*; import java.util.LinkedHashSet; @@ -28,7 +28,7 @@ public class FileCacheQueueSchedular implements Schedular { private String fileUrlAllName = ".urls.txt"; - private Site site; + private Task task; private String fileCursor = ".cursor.txt"; @@ -44,13 +44,13 @@ public class FileCacheQueueSchedular implements Schedular { private Set urls; - public FileCacheQueueSchedular(Site site) { - this.site = site; + public FileCacheQueueSchedular(Task task) { + this.task = task; } - public FileCacheQueueSchedular(Site site, String filePath) { + public FileCacheQueueSchedular(Task task, String filePath) { this.filePath = filePath; - this.site = site; + this.task = task; } private void flush() { @@ -106,7 +106,7 @@ public class FileCacheQueueSchedular implements Schedular { urls.add(line.trim()); lineReaded++; if (lineReaded > cursor.get()) { - queue.add(new Request(line, site)); + queue.add(new Request(line)); } } } @@ -121,11 +121,11 @@ public class FileCacheQueueSchedular implements Schedular { } private String getFileName(String filename) { - return filePath + site.getDomain() + "#" + site.getIdentifier() + filename; + return filePath + task.getUUID() + "/" + filename; } @Override - public synchronized void push(Request request, Site site) { + public synchronized void push(Request request, Task task) { if (!inited.get()) { init(); } @@ -140,7 +140,7 @@ public class FileCacheQueueSchedular implements Schedular { } @Override - public synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { if (!inited.get()) { init(); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java index b9c39c32..8c3da3b0 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/QueueSchedular.java @@ -2,7 +2,7 @@ package us.codecraft.webmagic.schedular; import org.apache.log4j.Logger; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; import java.util.HashSet; import java.util.Set; @@ -23,7 +23,7 @@ public class QueueSchedular implements Schedular { private Set urls = new HashSet(); @Override - public synchronized void push(Request request,Site site) { + public synchronized void push(Request request,Task task) { if (logger.isDebugEnabled()){ logger.debug("push to queue "+request.getUrl()); } @@ -34,7 +34,7 @@ public class QueueSchedular implements Schedular { } @Override - public synchronized Request poll(Site site) { + public synchronized Request poll(Task task) { return queue.poll(); } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java index 965ad258..8e4edb42 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/schedular/Schedular.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.schedular; import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; /** * Author: code4crafter@gmail.com @@ -10,8 +10,8 @@ import us.codecraft.webmagic.Site; */ public interface Schedular { - public void push(Request request,Site site); + public void push(Request request,Task task); - public Request poll(Site site); + public Request poll(Task task); } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index b17b05db..a9351a1b 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -37,7 +37,7 @@ public class DiandianBlogProcessor implements PageProcessor { public Site getSite() { //site定义抽取配置,以及开始url等 if (site == null) { - site = Site.me().setDomain("progressdaily.diandian.com").setStartUrl("http://progressdaily.diandian.com/"). + site = Site.me().setDomain("progressdaily.diandian.com").addStartUrl("http://progressdaily.diandian.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java index cc91f89a..fafb7de2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingBlogProcessor.java @@ -27,7 +27,7 @@ public class DianpingBlogProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.dianping.com").setStartUrl("http://www.dianping.com/"). + return Site.me().setDomain("www.dianping.com").addStartUrl("http://www.dianping.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index bcea9a31..8d64bbca 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -33,7 +33,7 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public Site getSite() { if (site==null){ - site= Site.me().setDomain("www.diaoyuweng.com").setStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). + site= Site.me().setDomain("www.diaoyuweng.com").addStartUrl("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31").setEncoding("GBK").setSleepTime(500); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 346b1e22..82db2dd7 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -23,6 +23,6 @@ public class F58PageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("sh.58.com").setStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 8286d781..36f69466 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -23,7 +23,7 @@ public class HuxiuProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.huxiu.com").setStartUrl("http://www.huxiu.com/"). + return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index f2d40506..11f04627 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -21,7 +21,7 @@ public class KaichibaProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("kaichiba.com").setStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). + return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setEncoding("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 30ba84d4..aaeca8f8 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -27,7 +27,7 @@ public class MeicanProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("meican.com").setStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). + return Site.me().setDomain("meican.com").addStartUrl("http://www.meican.com/shanghai/districts").setEncoding("utf-8"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index 3b61d764..58e19c4e 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -22,7 +22,7 @@ public class NjuBBSProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("bbs.nju.edu.cn").setStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). + return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 58d87a7f..bcc2d6ee 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -24,7 +24,7 @@ public class OschinaBlogPageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("my.oschina.net").setStartUrl("http://www.oschina.net/"). + return Site.me().setDomain("my.oschina.net").addStartUrl("http://www.oschina.net/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index 4f84a3f5..d85ca814 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -23,7 +23,7 @@ public class OschinaPageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.oschina.net").setStartUrl("http://www.oschina.net/"). + return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index e252eeff..fac491df 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -26,7 +26,7 @@ public class QzoneBlogProcessor implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("www.diandian.com").setStartUrl("http://17dujingdian.com/"). + return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 07f6d476..37c68649 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -26,7 +26,7 @@ public class SinaBlogProcesser implements PageProcessor { @Override public Site getSite() { if (site==null){ - site = Site.me().setDomain("blog.sina.com.cn").setStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). + site = Site.me().setDomain("blog.sina.com.cn").addStartUrl("http://blog.sina.com.cn/flashsword20").setSleepTime(3000). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } return site; diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index 564f1efb..db5f9ff2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -23,6 +23,6 @@ public class TianyaPageProcesser implements PageProcessor { @Override public Site getSite() { - return Site.me().setDomain("http://bbs.tianya.cn/").setStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. + return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. } } diff --git a/webmagic-samples/src/main/resources/ftl/wordpress.ftl b/webmagic-samples/src/main/resources/ftl/wordpress.ftl index f2feeb16..c2442ab6 100644 --- a/webmagic-samples/src/main/resources/ftl/wordpress.ftl +++ b/webmagic-samples/src/main/resources/ftl/wordpress.ftl @@ -1,13 +1,13 @@ ${title} - http://127.0.0.1/wordpress/?p=${id} + http://127.0.0.1/wordpress/?p=${uuid} ${date} admin - http://127.0.0.1/wordpress/?p=${id} + http://127.0.0.1/wordpress/?p=${uuid} - ${id} + ${uuid} ${date} ${date} open