From d11f8dac20efcd9b24cc9d06a6ac63270e59b56a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 8 Sep 2013 22:58:29 +0800 Subject: [PATCH 01/12] update samples version --- webmagic-samples/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index a42a719a..d4c12be9 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.3.1-SNAPSHOT + 0.3.2-SNAPSHOT 4.0.0 From a0d64b76357a449386755b9867163c91d04a2426 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 12 Sep 2013 21:06:05 +0800 Subject: [PATCH 02/12] update version --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5624019f..01a868db 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.3.0 + 0.3.1 us.codecraft webmagic-extension - 0.3.0 + 0.3.1 ## Get Started: From 81f75347573f70a39a83afd5d2f7d626b3b305bd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Thu, 12 Sep 2013 21:28:42 +0800 Subject: [PATCH 03/12] update version --- zh_docs/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/zh_docs/README.md b/zh_docs/README.md index 0ef0b4d4..9fa22ae3 100644 --- a/zh_docs/README.md +++ b/zh_docs/README.md @@ -34,13 +34,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.3.0 + 0.3.1 us.codecraft webmagic-extension - 0.3.0 - + 0.3.1 #### 项目结构 From 3c79d031bd339bbc65d2e78db7011726f0d3d50e Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 22 Sep 2013 22:52:52 +0800 Subject: [PATCH 04/12] fix thread pool --- .../src/main/java/us/codecraft/webmagic/Spider.java | 11 ++++++----- .../us/codecraft/webmagic/selector/Selectors.java | 5 ----- .../java/us/codecraft/webmagic/utils/UrlUtils.java | 2 -- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 40fb70db..45766f39 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -234,12 +234,15 @@ public class Spider implements Runnable, Task { } Request request = scheduler.poll(this); //single thread - if (executorService == null) { + if (threadNum <= 1) { while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { processRequest(request); request = scheduler.poll(this); } } else { + synchronized (this) { + this.executorService = ThreadUtils.newFixedThreadPool(threadNum); + } //multi thread final AtomicInteger threadAlive = new AtomicInteger(0); while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { @@ -363,10 +366,11 @@ public class Spider implements Runnable, Task { public void stop() { stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + executorService.shutdown(); } public void stopAndDestroy() { - stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + stop(); destroy(); } @@ -385,9 +389,6 @@ public class Spider implements Runnable, Task { if (threadNum == 1) { return this; } - synchronized (this) { - this.executorService = ThreadUtils.newFixedThreadPool(threadNum); - } return this; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 9764641c..0c34eadb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -44,9 +44,4 @@ public abstract class Selectors { return new OrSelector(selectors); } - public static void main(String[] args) { - String s = "a"; - or(regex("(.*)"), xpath("//title"), $("title")).select(s); - } - } \ No newline at end of file diff --git 
a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java index 4e5f67fc..9ca776d1 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/UrlUtils.java @@ -16,8 +16,6 @@ import java.util.regex.Pattern; */ public class UrlUtils { - private static Pattern relativePathPattern = Pattern.compile("^([\\.]+)/"); - /** * canonicalizeUrl * From fba330872b7ebc211318c66003392e39deb46f80 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 22 Sep 2013 23:57:15 +0800 Subject: [PATCH 05/12] fix a thread pool exception --- .../java/us/codecraft/webmagic/Spider.java | 779 +++++++++--------- .../codecraft/webmagic/utils/ThreadUtils.java | 27 +- .../us/codecraft/webmagic/SpiderTest.java | 5 +- 3 files changed, 407 insertions(+), 404 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java index 45766f39..829546bb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java @@ -21,23 +21,28 @@ import java.util.concurrent.atomic.AtomicInteger; /** * Entrance of a crawler.
- * A spider contains four modules: Downloader, Scheduler, PageProcessor and Pipeline.
- * Every module is a field of Spider.
- * The modules are defined in interface.
- * You can customize a spider with various implementations of them.
- * Examples:
+ * A spider contains four modules: Downloader, Scheduler, PageProcessor and + * Pipeline.
+ * Every module is a field of Spider.
+ * The modules are defined in interface.
+ * You can customize a spider with various implementations of them.
+ * Examples:
*
- * A simple crawler:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*")).run();
+ * A simple crawler:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", + * "http://my.oschina.net/*blog/*")).run();
*
- * Store results to files by FilePipeline:
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
+ * Store results to files by FilePipeline:
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", + * "http://my.oschina.net/*blog/*"))
+ * .pipeline(new FilePipeline("/data/temp/webmagic/")).run();
*
- * Use FileCacheQueueScheduler to store urls and cursor in files, so that a Spider can resume the status when shutdown.
- * Spider.create(new SimplePageProcessor("http://my.oschina.net/", "http://my.oschina.net/*blog/*"))
- * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
- * + * Use FileCacheQueueScheduler to store urls and cursor in files, so that a + * Spider can resume the status when shutdown.
+ * Spider.create(new SimplePageProcessor("http://my.oschina.net/", + * "http://my.oschina.net/*blog/*"))
+ * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run();
+ * * @author code4crafter@gmail.com
* @see Downloader * @see Scheduler @@ -47,373 +52,381 @@ import java.util.concurrent.atomic.AtomicInteger; */ public class Spider implements Runnable, Task { - protected Downloader downloader; - - protected List pipelines = new ArrayList(); - - protected PageProcessor pageProcessor; - - protected List startUrls; - - protected Site site; - - protected String uuid; - - protected Scheduler scheduler = new QueueScheduler(); - - protected Logger logger = Logger.getLogger(getClass()); - - protected ExecutorService executorService; - - protected int threadNum = 1; - - protected AtomicInteger stat = new AtomicInteger(STAT_INIT); - - protected final static int STAT_INIT = 0; - - protected final static int STAT_RUNNING = 1; - - protected final static int STAT_STOPPED = 2; - - /** - * create a spider with pageProcessor. - * - * @param pageProcessor - * @return new spider - * @see PageProcessor - */ - public static Spider create(PageProcessor pageProcessor) { - return new Spider(pageProcessor); - } - - /** - * create a spider with pageProcessor. - * - * @param pageProcessor - */ - public Spider(PageProcessor pageProcessor) { - this.pageProcessor = pageProcessor; - this.site = pageProcessor.getSite(); - this.startUrls = pageProcessor.getSite().getStartUrls(); - } - - /** - * Set startUrls of Spider.
- * Prior to startUrls of Site. - * - * @param startUrls - * @return this - */ - public Spider startUrls(List startUrls) { - checkIfRunning(); - this.startUrls = startUrls; - return this; - } - - /** - * Set an uuid for spider.
- * Default uuid is domain of site.
- * - * @param uuid - * @return this - */ - public Spider setUUID(String uuid) { - this.uuid = uuid; - return this; - } - - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @Deprecated - * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) - */ - public Spider scheduler(Scheduler scheduler) { - return setScheduler(scheduler); - } - - /** - * set scheduler for Spider - * - * @param scheduler - * @return this - * @see Scheduler - * @since 0.2.1 - */ - public Spider setScheduler(Scheduler scheduler) { - checkIfRunning(); - this.scheduler = scheduler; - return this; - } - - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) - * @deprecated - */ - public Spider pipeline(Pipeline pipeline) { - return addPipeline(pipeline); - } - - /** - * add a pipeline for Spider - * - * @param pipeline - * @return this - * @see Pipeline - * @since 0.2.1 - */ - public Spider addPipeline(Pipeline pipeline) { - checkIfRunning(); - this.pipelines.add(pipeline); - return this; - } - - /** - * clear the pipelines set - * - * @return this - */ - public Spider clearPipeline() { - pipelines = new ArrayList(); - return this; - } - - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) - * @deprecated - */ - public Spider downloader(Downloader downloader) { - return setDownloader(downloader); - } - - /** - * set the downloader of spider - * - * @param downloader - * @return this - * @see Downloader - */ - public Spider setDownloader(Downloader downloader) { - checkIfRunning(); - this.downloader = downloader; - return this; - } - - protected void checkComponent() { - if (downloader == null) { - this.downloader = new HttpClientDownloader(); - } - if (pipelines.isEmpty()) { - pipelines.add(new ConsolePipeline()); - } - downloader.setThread(threadNum); - } - - @Override - public 
void run() { - if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) - && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { - throw new IllegalStateException("Spider is already running!"); - } - checkComponent(); - if (startUrls != null) { - for (String startUrl : startUrls) { - scheduler.push(new Request(startUrl), this); - } - startUrls.clear(); - } - Request request = scheduler.poll(this); - //single thread - if (threadNum <= 1) { - while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - processRequest(request); - request = scheduler.poll(this); - } - } else { - synchronized (this) { - this.executorService = ThreadUtils.newFixedThreadPool(threadNum); - } - //multi thread - final AtomicInteger threadAlive = new AtomicInteger(0); - while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { - if (request == null) { - //when no request found but some thread is alive, sleep a while. - try { - Thread.sleep(100); - } catch (InterruptedException e) { - } - } else { - final Request requestFinal = request; - threadAlive.incrementAndGet(); - executorService.execute(new Runnable() { - @Override - public void run() { - processRequest(requestFinal); - threadAlive.decrementAndGet(); - } - }); - } - request = scheduler.poll(this); - if (threadAlive.get() == 0) { - request = scheduler.poll(this); - if (request == null) { - break; - } - } - } - executorService.shutdown(); - } - stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); - //release some resources - destroy(); - } - - protected void destroy() { - destroyEach(downloader); - destroyEach(pageProcessor); - for (Pipeline pipeline : pipelines) { - destroyEach(pipeline); - } - } - - private void destroyEach(Object object) { - if (object instanceof Closeable) { - try { - ((Closeable) object).close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - /** - * Process specific urls without url discovering. - * - * @param urls urls to process - */ - public void test(String... 
urls) { - checkComponent(); - if (urls.length > 0) { - for (String url : urls) { - processRequest(new Request(url)); - } - } - } - - protected void processRequest(Request request) { - Page page = downloader.download(request, this); - if (page == null) { - sleep(site.getSleepTime()); - return; - } - //for cycle retry - if (page.getHtml() == null) { - addRequest(page); - sleep(site.getSleepTime()); - return; - } - pageProcessor.process(page); - addRequest(page); - if (!page.getResultItems().isSkip()) { - for (Pipeline pipeline : pipelines) { - pipeline.process(page.getResultItems(), this); - } - } - sleep(site.getSleepTime()); - } - - protected void sleep(int time) { - try { - Thread.sleep(time); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - - protected void addRequest(Page page) { - if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { - for (Request request : page.getTargetRequests()) { - scheduler.push(request, this); - } - } - } - - protected void checkIfRunning() { - if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { - throw new IllegalStateException("Spider is already running!"); - } - } - - public void runAsync() { - Thread thread = new Thread(this); - thread.setDaemon(false); - thread.start(); - } - - public void start() { - runAsync(); - } - - public void stop() { - stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); - executorService.shutdown(); - } - - public void stopAndDestroy() { - stop(); - destroy(); - } - - /** - * start with more than one threads - * - * @param threadNum - * @return this - */ - public Spider thread(int threadNum) { - checkIfRunning(); - this.threadNum = threadNum; - if (threadNum <= 0) { - throw new IllegalArgumentException("threadNum should be more than one!"); - } - if (threadNum == 1) { - return this; - } - return this; - } - - /** - * switch off xsoup - * - * @return - */ - public static void xsoupOff() { - EnvironmentUtil.setUseXsoup(false); - } - - 
@Override - public String getUUID() { - if (uuid != null) { - return uuid; - } - if (site != null) { - return site.getDomain(); - } - return null; - } - - @Override - public Site getSite() { - return site; - } + protected Downloader downloader; + + protected List pipelines = new ArrayList(); + + protected PageProcessor pageProcessor; + + protected List startUrls; + + protected Site site; + + protected String uuid; + + protected Scheduler scheduler = new QueueScheduler(); + + protected Logger logger = Logger.getLogger(getClass()); + + protected ExecutorService executorService; + + protected int threadNum = 1; + + protected AtomicInteger stat = new AtomicInteger(STAT_INIT); + + protected final static int STAT_INIT = 0; + + protected final static int STAT_RUNNING = 1; + + protected final static int STAT_STOPPED = 2; + + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + * @return new spider + * @see PageProcessor + */ + public static Spider create(PageProcessor pageProcessor) { + return new Spider(pageProcessor); + } + + /** + * create a spider with pageProcessor. + * + * @param pageProcessor + */ + public Spider(PageProcessor pageProcessor) { + this.pageProcessor = pageProcessor; + this.site = pageProcessor.getSite(); + this.startUrls = pageProcessor.getSite().getStartUrls(); + } + + /** + * Set startUrls of Spider.
+ * Prior to startUrls of Site. + * + * @param startUrls + * @return this + */ + public Spider startUrls(List startUrls) { + checkIfRunning(); + this.startUrls = startUrls; + return this; + } + + /** + * Set an uuid for spider.
+ * Default uuid is domain of site.
+ * + * @param uuid + * @return this + */ + public Spider setUUID(String uuid) { + this.uuid = uuid; + return this; + } + + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @Deprecated + * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler) + */ + public Spider scheduler(Scheduler scheduler) { + return setScheduler(scheduler); + } + + /** + * set scheduler for Spider + * + * @param scheduler + * @return this + * @see Scheduler + * @since 0.2.1 + */ + public Spider setScheduler(Scheduler scheduler) { + checkIfRunning(); + this.scheduler = scheduler; + return this; + } + + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see #setPipeline(us.codecraft.webmagic.pipeline.Pipeline) + * @deprecated + */ + public Spider pipeline(Pipeline pipeline) { + return addPipeline(pipeline); + } + + /** + * add a pipeline for Spider + * + * @param pipeline + * @return this + * @see Pipeline + * @since 0.2.1 + */ + public Spider addPipeline(Pipeline pipeline) { + checkIfRunning(); + this.pipelines.add(pipeline); + return this; + } + + /** + * clear the pipelines set + * + * @return this + */ + public Spider clearPipeline() { + pipelines = new ArrayList(); + return this; + } + + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader) + * @deprecated + */ + public Spider downloader(Downloader downloader) { + return setDownloader(downloader); + } + + /** + * set the downloader of spider + * + * @param downloader + * @return this + * @see Downloader + */ + public Spider setDownloader(Downloader downloader) { + checkIfRunning(); + this.downloader = downloader; + return this; + } + + protected void checkComponent() { + if (downloader == null) { + this.downloader = new HttpClientDownloader(); + } + if (pipelines.isEmpty()) { + pipelines.add(new ConsolePipeline()); + } + downloader.setThread(threadNum); + } + + @Override + public 
void run() { + if (!stat.compareAndSet(STAT_INIT, STAT_RUNNING) && !stat.compareAndSet(STAT_STOPPED, STAT_RUNNING)) { + throw new IllegalStateException("Spider is already running!"); + } + checkComponent(); + if (startUrls != null) { + for (String startUrl : startUrls) { + scheduler.push(new Request(startUrl), this); + } + startUrls.clear(); + } + Request request = scheduler.poll(this); + logger.info("Spider " + getUUID() + " started!"); + // single thread + if (threadNum <= 1) { + while (request != null && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { + processRequest(request); + request = scheduler.poll(this); + } + } else { + synchronized (this) { + this.executorService = ThreadUtils.newFixedThreadPool(threadNum); + } + // multi thread + final AtomicInteger threadAlive = new AtomicInteger(0); + while (true && stat.compareAndSet(STAT_RUNNING, STAT_RUNNING)) { + if (request == null) { + // when no request found but some thread is alive, sleep a + // while. + try { + Thread.sleep(100); + } catch (InterruptedException e) { + } + } else { + final Request requestFinal = request; + threadAlive.incrementAndGet(); + executorService.execute(new Runnable() { + @Override + public void run() { + processRequest(requestFinal); + threadAlive.decrementAndGet(); + } + }); + } + request = scheduler.poll(this); + if (threadAlive.get() == 0) { + request = scheduler.poll(this); + if (request == null) { + break; + } + } + } + executorService.shutdown(); + } + stat.compareAndSet(STAT_RUNNING, STAT_STOPPED); + // release some resources + destroy(); + } + + protected void destroy() { + destroyEach(downloader); + destroyEach(pageProcessor); + for (Pipeline pipeline : pipelines) { + destroyEach(pipeline); + } + } + + private void destroyEach(Object object) { + if (object instanceof Closeable) { + try { + ((Closeable) object).close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + + /** + * Process specific urls without url discovering. 
+ * + * @param urls + * urls to process + */ + public void test(String... urls) { + checkComponent(); + if (urls.length > 0) { + for (String url : urls) { + processRequest(new Request(url)); + } + } + } + + protected void processRequest(Request request) { + Page page = downloader.download(request, this); + if (page == null) { + sleep(site.getSleepTime()); + return; + } + // for cycle retry + if (page.getHtml() == null) { + addRequest(page); + sleep(site.getSleepTime()); + return; + } + pageProcessor.process(page); + addRequest(page); + if (!page.getResultItems().isSkip()) { + for (Pipeline pipeline : pipelines) { + pipeline.process(page.getResultItems(), this); + } + } + sleep(site.getSleepTime()); + } + + protected void sleep(int time) { + try { + Thread.sleep(time); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + protected void addRequest(Page page) { + if (CollectionUtils.isNotEmpty(page.getTargetRequests())) { + for (Request request : page.getTargetRequests()) { + scheduler.push(request, this); + } + } + } + + protected void checkIfRunning() { + if (!stat.compareAndSet(STAT_INIT, STAT_INIT) && !stat.compareAndSet(STAT_STOPPED, STAT_STOPPED)) { + throw new IllegalStateException("Spider is already running!"); + } + } + + public void runAsync() { + Thread thread = new Thread(this); + thread.setDaemon(false); + thread.start(); + } + + public void start() { + runAsync(); + } + + public void stop() { + if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) { + if (executorService != null) { + executorService.shutdown(); + } + logger.info("Spider " + getUUID() + " stop success!"); + } else { + logger.info("Spider " + getUUID() + " stop fail!"); + } + } + + public void stopAndDestroy() { + stop(); + destroy(); + } + + /** + * start with more than one threads + * + * @param threadNum + * @return this + */ + public Spider thread(int threadNum) { + checkIfRunning(); + this.threadNum = threadNum; + if (threadNum <= 0) { + throw new 
IllegalArgumentException("threadNum should be more than one!"); + } + if (threadNum == 1) { + return this; + } + return this; + } + + /** + * switch off xsoup + * + * @return + */ + public static void xsoupOff() { + EnvironmentUtil.setUseXsoup(false); + } + + @Override + public String getUUID() { + if (uuid != null) { + return uuid; + } + if (site != null) { + return site.getDomain(); + } + return null; + } + + @Override + public Site getSite() { + return site; + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java index 0d5666c9..ba9774db 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/ThreadUtils.java @@ -1,7 +1,7 @@ package us.codecraft.webmagic.utils; import java.util.concurrent.ExecutorService; -import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -11,22 +11,11 @@ import java.util.concurrent.TimeUnit; */ public class ThreadUtils { - public static ExecutorService newFixedThreadPool(int threadSize) { - return new ThreadPoolExecutor(threadSize, threadSize, 0L, TimeUnit.MILLISECONDS, - new LinkedBlockingQueue(1) { - - private static final long serialVersionUID = -9028058603126367678L; - - @Override - public boolean offer(Runnable e) { - try { - put(e); - return true; - } catch (InterruptedException ie) { - Thread.currentThread().interrupt(); - } - return false; - } - }); - } + public static ExecutorService newFixedThreadPool(int threadSize) { + if (threadSize <= 1) { + throw new IllegalArgumentException("ThreadSize must be greater than 1!"); + } + return new ThreadPoolExecutor(threadSize - 1, threadSize - 1, 0L, TimeUnit.MILLISECONDS, + new SynchronousQueue(), new ThreadPoolExecutor.CallerRunsPolicy()); + } } diff --git 
a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java index b3249ce2..75c1ba11 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -18,11 +18,12 @@ public class SpiderTest { public void process(ResultItems resultItems, Task task) { System.out.println(1); } - }); + }).thread(2); spider.start(); Thread.sleep(10000); spider.stop(); -// spider.run(); + Thread.sleep(10000); + spider.start(); Thread.sleep(10000); } } From b18216245b2130ccf9ed984011b799d0cb8ac8ad Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 07:53:33 +0800 Subject: [PATCH 06/12] add type convert --- .../webmagic/model/FieldExtractor.java | 11 ++ .../webmagic/model/PageModelExtractor.java | 100 ++++++++++-- .../webmagic/model/annotation/Formatter.java | 41 +++++ .../model/formatter/BasicTypeFormatter.java | 150 ++++++++++++++++++ .../model/formatter/DateFormatter.java | 29 ++++ .../model/formatter/ObjectFormatter.java | 14 ++ .../model/formatter/ObjectFormatters.java | 27 ++++ .../codecraft/webmagic/model/GithubRepo.java | 12 +- 8 files changed, 368 insertions(+), 16 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java index 
600e184a..a2cba133 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java @@ -1,5 +1,6 @@ package us.codecraft.webmagic.model; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; import us.codecraft.webmagic.selector.Selector; import java.lang.reflect.Field; @@ -16,6 +17,8 @@ class FieldExtractor extends Extractor { private Method setterMethod; + private ObjectFormatter objectFormatter; + public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { super(selector, source, notNull, multi); this.field = field; @@ -44,4 +47,12 @@ class FieldExtractor extends Extractor { boolean isNotNull() { return notNull; } + + ObjectFormatter getObjectFormatter() { + return objectFormatter; + } + + void setObjectFormatter(ObjectFormatter objectFormatter) { + this.objectFormatter = objectFormatter; + } } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 54d942c1..370b0fb2 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -1,8 +1,12 @@ package us.codecraft.webmagic.model; import org.apache.commons.lang3.StringUtils; +import org.apache.log4j.Logger; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.model.annotation.*; +import us.codecraft.webmagic.model.formatter.BasicTypeFormatter; +import us.codecraft.webmagic.model.formatter.ObjectFormatter; +import us.codecraft.webmagic.model.formatter.ObjectFormatters; import us.codecraft.webmagic.selector.*; import us.codecraft.webmagic.utils.ExtractorUtils; @@ -36,6 +40,8 @@ class PageModelExtractor { private Extractor objectExtractor; + private Logger logger = 
Logger.getLogger(getClass()); + public static PageModelExtractor create(Class clazz) { PageModelExtractor pageModelExtractor = new PageModelExtractor(); pageModelExtractor.init(clazz); @@ -62,16 +68,60 @@ class PageModelExtractor { fieldExtractor = fieldExtractorTmp; } if (fieldExtractor != null) { - if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be string"); - } else if (fieldExtractor.isMulti() && !List.class.isAssignableFrom(field.getType())) { - throw new IllegalStateException("Field " + field.getName() + " must be list"); - } + checkFormat(field, fieldExtractor); fieldExtractors.add(fieldExtractor); } } } + private void checkFormat(Field field, FieldExtractor fieldExtractor) { + if (!fieldExtractor.isMulti() && !String.class.isAssignableFrom(field.getType())) { + Class fieldClazz = BasicTypeFormatter.detectBasicClass(field.getType()); + ObjectFormatter objectFormatter = getObjectFormatter(field, fieldClazz); + if (objectFormatter == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + fieldClazz); + } else { + fieldExtractor.setObjectFormatter(objectFormatter); + } + } else if (fieldExtractor.isMulti()) { + if (!List.class.isAssignableFrom(field.getType())) { + throw new IllegalStateException("Field " + field.getName() + " must be list"); + } + Formatter formatter = field.getAnnotation(Formatter.class); + if (formatter != null) { + if (!formatter.subClazz().equals(Void.class)) { + ObjectFormatter objectFormatter = getObjectFormatter(field, formatter.subClazz()); + if (objectFormatter == null) { + throw new IllegalStateException("Can't find formatter for field " + field.getName() + " of type " + formatter.subClazz()); + } else { + fieldExtractor.setObjectFormatter(objectFormatter); + } + } + } + } + } + + private ObjectFormatter getObjectFormatter(Field field, Class fieldClazz) { + Formatter 
formatter = field.getAnnotation(Formatter.class); + if (formatter != null) { + if (!formatter.formatter().equals(ObjectFormatter.class)) { + return initFormatter(formatter); + } + } + return ObjectFormatters.get(fieldClazz); + } + + private ObjectFormatter initFormatter(Formatter formatter) { + try { + return formatter.formatter().newInstance(); + } catch (InstantiationException e) { + logger.error("init ObjectFormatter fail", e); + } catch (IllegalAccessException e) { + logger.error("init ObjectFormatter fail", e); + } + return null; + } + private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) { FieldExtractor fieldExtractor = null; ExtractByUrl extractByUrl = field.getAnnotation(ExtractByUrl.class); @@ -231,7 +281,12 @@ class PageModelExtractor { if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) { return null; } - setField(o, fieldExtractor, value); + if (fieldExtractor.getObjectFormatter() != null) { + List converted = convert(value, fieldExtractor.getObjectFormatter()); + setField(o, fieldExtractor, converted); + } else { + setField(o, fieldExtractor, value); + } } else { String value; switch (fieldExtractor.getSource()) { @@ -254,22 +309,47 @@ class PageModelExtractor { if (value == null && fieldExtractor.isNotNull()) { return null; } - setField(o, fieldExtractor, value); + if (fieldExtractor.getObjectFormatter() != null) { + Object converted = convert(value, fieldExtractor.getObjectFormatter()); + setField(o, fieldExtractor, converted); + } else { + setField(o, fieldExtractor, value); + } } } if (AfterExtractor.class.isAssignableFrom(clazz)) { ((AfterExtractor) o).afterProcess(page); } } catch (InstantiationException e) { - e.printStackTrace(); + logger.error("extract fail", e); } catch (IllegalAccessException e) { - e.printStackTrace(); + logger.error("extract fail", e); } catch (InvocationTargetException e) { - e.printStackTrace(); + logger.error("extract fail", e); } return o; } + private Object convert(String 
value, ObjectFormatter objectFormatter) { + try { + return objectFormatter.format(value); + } catch (Exception e) { + logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); + } + return null; + } + + private List convert(List values, ObjectFormatter objectFormatter) { + List objects = new ArrayList(); + for (String value : values) { + Object converted = convert(value, objectFormatter); + if (converted != null) { + objects.add(converted); + } + } + return objects; + } + private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { if (fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java new file mode 100644 index 00000000..e603c59f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java @@ -0,0 +1,41 @@ +package us.codecraft.webmagic.model.annotation; + +import us.codecraft.webmagic.model.formatter.ObjectFormatter; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +/** + * Define how the result string is convert to an object for field. + * + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +@Retention(java.lang.annotation.RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Formatter { + + /** + * Set formatter params. + * + * @return formatter params + */ + String[] value(); + + /** + * Specify the class of the field, or the class of the elements when the field is a collection.
+ * It does not need to be set, because the class can be detected from the declared type of the field, + * unless the field is a collection.
+ * + * @return the class of field + */ + Class subClazz() default Void.class; + + /** + * If there are more than one formatter for a class, just specify the implement. + * @return implement + */ + Class formatter() default ObjectFormatter.class; + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java new file mode 100644 index 00000000..2669582a --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -0,0 +1,150 @@ +package us.codecraft.webmagic.model.formatter; + +import java.util.Arrays; +import java.util.List; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public abstract class BasicTypeFormatter implements ObjectFormatter { + + @Override + public void initParam(String[] extra) { + + } + + @Override + public T format(String raw) throws Exception { + if (raw == null) { + return null; + } + raw = raw.trim(); + return formatTrimmed(raw); + } + + protected abstract T formatTrimmed(String raw) throws Exception; + + public static final List basicTypeFormatters = Arrays.asList(new IntegerFormatter(), + new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(), + new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter()); + + public static Class detectBasicClass(Class type) { + if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { + return Integer.class; + } else if (type.equals(Long.TYPE) || type.equals(Long.class)) { + return Long.class; + } else if (type.equals(Double.TYPE) || type.equals(Double.class)) { + return Double.class; + } else if (type.equals(Float.TYPE) || type.equals(Float.class)) { + return Float.class; + } else if (type.equals(Short.TYPE) || type.equals(Short.class)) { + return Short.class; + } else if (type.equals(Character.TYPE) || type.equals(Character.class)) { + return 
Character.class; + } else if (type.equals(Byte.TYPE) || type.equals(Byte.class)) { + return Byte.class; + } else if (type.equals(Boolean.TYPE) || type.equals(Boolean.class)) { + return Boolean.class; + } + return type; + } + + public static class IntegerFormatter extends BasicTypeFormatter { + @Override + public Integer formatTrimmed(String raw) throws Exception { + return Integer.parseInt(raw); + } + + @Override + public Class clazz() { + return Integer.class; + } + } + + public static class LongFormatter extends BasicTypeFormatter { + @Override + public Long formatTrimmed(String raw) throws Exception { + return Long.parseLong(raw); + } + + @Override + public Class clazz() { + return Long.class; + } + } + + public static class DoubleFormatter extends BasicTypeFormatter { + @Override + public Double formatTrimmed(String raw) throws Exception { + return Double.parseDouble(raw); + } + + @Override + public Class clazz() { + return Double.class; + } + } + + public static class FloatFormatter extends BasicTypeFormatter { + @Override + public Float formatTrimmed(String raw) throws Exception { + return Float.parseFloat(raw); + } + + @Override + public Class clazz() { + return Float.class; + } + } + + public static class ShortFormatter extends BasicTypeFormatter { + @Override + public Short formatTrimmed(String raw) throws Exception { + return Short.parseShort(raw); + } + + @Override + public Class clazz() { + return Short.class; + } + } + + public static class CharactorFormatter extends BasicTypeFormatter { + @Override + public Character formatTrimmed(String raw) throws Exception { + return raw.charAt(0); + } + + @Override + public Class clazz() { + return Character.class; + } + } + + public static class ByteFormatter extends BasicTypeFormatter { + @Override + public Byte formatTrimmed(String raw) throws Exception { + return Byte.parseByte(raw, 10); + } + + @Override + public Class clazz() { + return Byte.class; + } + } + + public static class BooleanFormatter extends 
BasicTypeFormatter { + @Override + public Boolean formatTrimmed(String raw) throws Exception { + return Boolean.parseBoolean(raw); + } + + @Override + public Class clazz() { + return Boolean.class; + } + } + + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java new file mode 100644 index 00000000..0ad0302b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java @@ -0,0 +1,29 @@ +package us.codecraft.webmagic.model.formatter; + +import org.apache.commons.lang3.time.DateUtils; + +import java.util.Date; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class DateFormatter implements ObjectFormatter { + + private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"}; + + @Override + public Date format(String raw) throws Exception { + return DateUtils.parseDate(raw, datePatterns); + } + + @Override + public Class clazz() { + return Date.class; + } + + @Override + public void initParam(String[] extra) { + datePatterns = extra; + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java new file mode 100644 index 00000000..aea7272b --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java @@ -0,0 +1,14 @@ +package us.codecraft.webmagic.model.formatter; + +/** + * @author code4crafter@gmail.com + */ +public interface ObjectFormatter { + + T format(String raw) throws Exception; + + Class clazz(); + + void initParam(String[] extra); + +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java new file mode 100644 
index 00000000..6dedc3ce --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java @@ -0,0 +1,27 @@ +package us.codecraft.webmagic.model.formatter; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * @author code4crafter@gmail.com + * @since 0.3.2 + */ +public class ObjectFormatters { + + private static Map formatterMap = new ConcurrentHashMap(); + + static { + for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { + put(basicTypeFormatter); + } + } + + public static void put(ObjectFormatter objectFormatter) { + formatterMap.put(objectFormatter.clazz(), objectFormatter); + } + + public static ObjectFormatter get(Class clazz){ + return formatterMap.get(clazz); + } +} diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java index 5b6319a0..a9e049b1 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java @@ -32,10 +32,10 @@ public class GithubRepo implements HasKey { private List language; @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") - private String star; + private int star; @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") - private String fork; + private int fork; @ExtractByUrl private String url; @@ -46,8 +46,8 @@ public class GithubRepo implements HasKey { , new PageModelPipeline() { @Override public void process(GithubRepo o, Task task) { - Assert.assertEquals("78",o.getStar().trim()); - Assert.assertEquals("65",o.getFork().trim()); + Assert.assertEquals(78, o.getStar()); + Assert.assertEquals(65, o.getFork()); } }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); } @@ -77,11 +77,11 
@@ public class GithubRepo implements HasKey { return url; } - public String getStar() { + public int getStar() { return star; } - public String getFork() { + public int getFork() { return fork; } } From 250cc5e66235eab0e13d6b6e99736805a7c11399 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 08:17:21 +0800 Subject: [PATCH 07/12] change formatter to class --- .../webmagic/example}/GithubRepo.java | 24 +++-------------- .../webmagic/model/PageModelExtractor.java | 8 +++--- .../model/formatter/BasicTypeFormatter.java | 6 ++--- .../model/formatter/ObjectFormatters.java | 17 ++++++++---- .../webmagic/model/GithubRepoTest.java | 26 +++++++++++++++++++ .../webmagic/model/samples/OschinaBlog.java | 12 ++++----- 6 files changed, 54 insertions(+), 39 deletions(-) rename webmagic-extension/src/{test/java/us/codecraft/webmagic/model => main/java/us/codecraft/webmagic/example}/GithubRepo.java (66%) create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java similarity index 66% rename from webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java rename to webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index a9e049b1..d9501154 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -1,10 +1,6 @@ -package us.codecraft.webmagic.model; +package us.codecraft.webmagic.example; -import junit.framework.Assert; -import org.junit.Test; -import us.codecraft.webmagic.MockDownloader; -import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.annotation.ExtractBy; import 
us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; @@ -25,10 +21,10 @@ public class GithubRepo implements HasKey { @ExtractByUrl("https://github\\.com/(\\w+)/.*") private String author; - @ExtractBy("//div[@id='readme']") + @ExtractBy("//div[@id='readme']/tidyText()") private String readme; - @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']", multi = true) + @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true) private List language; @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") @@ -40,18 +36,6 @@ public class GithubRepo implements HasKey { @ExtractByUrl private String url; - @Test - public void test() { - OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) - , new PageModelPipeline() { - @Override - public void process(GithubRepo o, Task task) { - Assert.assertEquals(78, o.getStar()); - Assert.assertEquals(65, o.getFork()); - } - }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); - } - @Override public String key() { return author + ":" + name; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index 370b0fb2..cd3e72b4 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -105,15 +105,15 @@ class PageModelExtractor { Formatter formatter = field.getAnnotation(Formatter.class); if (formatter != null) { if (!formatter.formatter().equals(ObjectFormatter.class)) { - return initFormatter(formatter); + return initFormatter(formatter.formatter()); } } - return ObjectFormatters.get(fieldClazz); + return 
initFormatter(ObjectFormatters.get(fieldClazz)); } - private ObjectFormatter initFormatter(Formatter formatter) { + private ObjectFormatter initFormatter(Class formatterClazz) { try { - return formatter.formatter().newInstance(); + return formatterClazz.newInstance(); } catch (InstantiationException e) { logger.error("init ObjectFormatter fail", e); } catch (IllegalAccessException e) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java index 2669582a..f9d76a84 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/BasicTypeFormatter.java @@ -25,9 +25,9 @@ public abstract class BasicTypeFormatter implements ObjectFormatter { protected abstract T formatTrimmed(String raw) throws Exception; - public static final List basicTypeFormatters = Arrays.asList(new IntegerFormatter(), - new LongFormatter(), new DoubleFormatter(), new FloatFormatter(), new ShortFormatter(), - new CharactorFormatter(), new ByteFormatter(), new BooleanFormatter()); + public static final List> basicTypeFormatters = Arrays.>asList(IntegerFormatter.class, + LongFormatter.class, DoubleFormatter.class, FloatFormatter.class, ShortFormatter.class, + CharactorFormatter.class, ByteFormatter.class, BooleanFormatter.class); public static Class detectBasicClass(Class type) { if (type.equals(Integer.TYPE) || type.equals(Integer.class)) { diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java index 6dedc3ce..7534e5ea 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java +++ 
b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java @@ -9,19 +9,26 @@ import java.util.concurrent.ConcurrentHashMap; */ public class ObjectFormatters { - private static Map formatterMap = new ConcurrentHashMap(); + private static Map> formatterMap = new ConcurrentHashMap>(); static { - for (ObjectFormatter basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { + for (Class basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { put(basicTypeFormatter); } + put(DateFormatter.class); } - public static void put(ObjectFormatter objectFormatter) { - formatterMap.put(objectFormatter.clazz(), objectFormatter); + public static void put(Class objectFormatter) { + try { + formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter); + } catch (InstantiationException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } } - public static ObjectFormatter get(Class clazz){ + public static Class get(Class clazz){ return formatterMap.get(clazz); } } diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java new file mode 100644 index 00000000..97555503 --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -0,0 +1,26 @@ +package us.codecraft.webmagic.model; + +import junit.framework.Assert; +import org.junit.Test; +import us.codecraft.webmagic.MockDownloader; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.example.GithubRepo; + +/** + * @author code4crafter@gmail.com
+ */ +public class GithubRepoTest { + + @Test + public void test() { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft/webmagic").setSleepTime(0) + , new PageModelPipeline() { + @Override + public void process(GithubRepo o, Task task) { + Assert.assertEquals(78, o.getStar()); + Assert.assertEquals(65, o.getFork()); + } + }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 7819b446..8e6602cc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -1,19 +1,19 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.HasKey; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; +import java.util.Date; import java.util.List; /** * @author code4crafter@gmail.com
*/ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog implements HasKey{ +public class OschinaBlog{ @ExtractBy("//title") private String title; @@ -24,16 +24,14 @@ public class OschinaBlog implements HasKey{ @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; + @ExtractBy("//div[class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')") + private Date date; + public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); } - @Override - public String key() { - return title; - } - public String getTitle() { return title; } From 95ab4edec3daca3353395909a13085079ff8606b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 08:38:54 +0800 Subject: [PATCH 08/12] some bugfix --- .../main/java/us/codecraft/webmagic/Site.java | 10 ++-- .../webmagic/example/OschinaBlog.java | 59 +++++++++++++++++++ .../webmagic/model/samples/OschinaBlog.java | 4 -- 3 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 6a351786..4c7b9928 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -96,11 +96,6 @@ public class Site { * @return get domain */ public String getDomain() { - if (domain == null) { - if (startUrls.size() > 0) { - domain = UrlUtils.getDomain(startUrls.get(0)); - } - } return domain; } @@ -176,6 +171,11 @@ public class Site { */ public Site addStartUrl(String startUrl) { this.startUrls.add(startUrl); + if (domain == null) { + if (startUrls.size() > 0) { + domain = UrlUtils.getDomain(startUrls.get(0)); + } + } return this; } diff --git 
a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java new file mode 100644 index 00000000..703d6a4f --- /dev/null +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -0,0 +1,59 @@ +package us.codecraft.webmagic.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.AfterExtractor; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.model.annotation.Formatter; +import us.codecraft.webmagic.model.annotation.TargetUrl; +import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +@TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") +public class OschinaBlog implements AfterExtractor{ + + @ExtractBy("//title/text()") + private String title; + + @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + private String content; + + @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) + private List tags; + + @Formatter("YYYY-MM-dd HH:mm") + @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") + private String date; + + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") + ,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + } + + public String getTitle() { + return title; + } + + public String getContent() { + return content; + } + + public List getTags() { + return tags; + } + +// public Date getDate() { +// return date; +// } + + @Override + public void afterProcess(Page page) { + System.out.println(date); + System.out.println(title); + } +} diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java index 8e6602cc..a7f51adf 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaBlog.java @@ -6,7 +6,6 @@ import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; -import java.util.Date; import java.util.List; /** @@ -24,9 +23,6 @@ public class OschinaBlog{ @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; - @ExtractBy("//div[class='BlogStat']/regex('\\d{4}-\\d{1,2}-\\d{1,2} \\d{1,2}:\\d{1,2}')") - private Date date; - public static void main(String[] args) { 
OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") ,new JsonFilePageModelPipeline(), OschinaBlog.class).run(); From b131878123cb90f6123255bbd21e71bc70a480b7 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 13:01:28 +0800 Subject: [PATCH 09/12] add example --- .../example/GithubRepoPageProcesser.java | 36 +++++++++++++++++ .../example/OschinaBlogPageProcesser.java | 39 +++++++++++++++++++ .../webmagic/example/GithubRepo.java | 13 ++++++- .../webmagic/example/OschinaBlog.java | 25 +++++------- .../webmagic/model/PageModelExtractor.java | 15 ++++++- .../model/formatter/DateFormatter.java | 2 +- .../webmagic/formatter/DateFormatterTest.java | 20 ++++++++++ .../src/test/resouces/log4j.xml | 31 +++++++++++++++ 8 files changed, 161 insertions(+), 20 deletions(-) create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java create mode 100644 webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java create mode 100644 webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java create mode 100644 webmagic-extension/src/test/resouces/log4j.xml diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java new file mode 100644 index 00000000..0e7e3b92 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +public class GithubRepoPageProcesser implements PageProcessor { + + private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new GithubRepoPageProcesser()).thread(5).run(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java new file mode 100644 index 00000000..fa8dab6d --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + if (page.getResultItems().get("title") == null) { + //skip this page + page.setSkip(true); + } + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); + page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index d9501154..58441cbc 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.example; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.HasKey; +import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; @@ -10,6 +13,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
+ * @since 0.3.2 */ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) @@ -27,15 +31,20 @@ public class GithubRepo implements HasKey { @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true) private List language; - @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") + @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") private int star; - @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") private int fork; @ExtractByUrl private String url; + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100) + , new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run(); + } + @Override public String key() { return author + ":" + name; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index 703d6a4f..1545f885 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -1,38 +1,38 @@ package us.codecraft.webmagic.example; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.AfterExtractor; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.Formatter; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; +import java.util.Date; import java.util.List; /** * @author 
code4crafter@gmail.com
+ * @since 0.3.2 */ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog implements AfterExtractor{ +public class OschinaBlog { @ExtractBy("//title/text()") private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) private String content; @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; - @Formatter("YYYY-MM-dd HH:mm") + @Formatter("yyyy-MM-dd HH:mm") @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") - private String date; + private Date date; public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); } public String getTitle() { @@ -47,13 +47,8 @@ public class OschinaBlog implements AfterExtractor{ return tags; } -// public Date getDate() { -// return date; -// } - - @Override - public void afterProcess(Page page) { - System.out.println(date); - System.out.println(title); + public Date getDate() { + return date; } + } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index cd3e72b4..c78bd31e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -105,7 +105,8 @@ class PageModelExtractor { Formatter formatter = field.getAnnotation(Formatter.class); if (formatter != null) { if (!formatter.formatter().equals(ObjectFormatter.class)) { - return initFormatter(formatter.formatter()); + ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); + 
objectFormatter.initParam(formatter.value()); } } return initFormatter(ObjectFormatters.get(fieldClazz)); @@ -311,6 +312,9 @@ class PageModelExtractor { } if (fieldExtractor.getObjectFormatter() != null) { Object converted = convert(value, fieldExtractor.getObjectFormatter()); + if (converted == null && fieldExtractor.isNotNull()) { + return null; + } setField(o, fieldExtractor, converted); } else { setField(o, fieldExtractor, value); @@ -332,7 +336,11 @@ class PageModelExtractor { private Object convert(String value, ObjectFormatter objectFormatter) { try { - return objectFormatter.format(value); + Object format = objectFormatter.format(value); + if (logger.isDebugEnabled()) { + logger.debug("String " + value + " is converted to " + format); + } + return format; } catch (Exception e) { logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); } @@ -351,6 +359,9 @@ class PageModelExtractor { } private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value==null){ + return; + } if (fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java index 0ad0302b..b0f6e771 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java @@ -10,7 +10,7 @@ import java.util.Date; */ public class DateFormatter implements ObjectFormatter { - private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"}; + private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"}; @Override public Date format(String raw) throws Exception { diff --git 
a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java new file mode 100644 index 00000000..a621e2dc --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.formatter; + +import org.junit.Test; +import us.codecraft.webmagic.model.formatter.DateFormatter; + +import java.util.Date; + +/** + * @author code4crafter@gmail.com + */ +public class DateFormatterTest { + + @Test + public void testDateFormatter() throws Exception { + DateFormatter dateFormatter = new DateFormatter(); + dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"}); + Date format = dateFormatter.format("2013-09-10 22:11"); + System.out.println(format); + } +} diff --git a/webmagic-extension/src/test/resouces/log4j.xml b/webmagic-extension/src/test/resouces/log4j.xml new file mode 100644 index 00000000..a58e889b --- /dev/null +++ b/webmagic-extension/src/test/resouces/log4j.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 6f18eec77eda0118358783d31f0937f09e32bb16 Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 13:07:33 +0800 Subject: [PATCH 10/12] fix a test error --- .../us/codecraft/webmagic/MockDownloader.java | 375 +++++------------- .../webmagic/model/GithubRepoTest.java | 4 +- 2 files changed, 98 insertions(+), 281 deletions(-) diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java index 8114b040..aa62e9e0 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/MockDownloader.java @@ -18,7 +18,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " \n" + - " code4craft/webmagic\n" + + " 
code4craft/webmagic · GitHub\n" + " \n" + " \n" + " \n" + @@ -27,7 +27,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + @@ -38,7 +38,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + "\n" + " \n" + @@ -48,15 +48,16 @@ public class MockDownloader implements Downloader{ " \n" + "\n" + "\n" + - " \n" + - " \n" + + " \n" + + " \n" + " \n" + "\n" + + " \n" + "\n" + - " \n" + - " \n" + + " \n" + + " \n" + " \n" + - " \n" + + " \n" + "\n" + " \n" + " \n" + @@ -73,37 +74,42 @@ public class MockDownloader implements Downloader{ " \n" + "\n" + "\n" + - " \n" + + " \n" + "
\n" + " \n" + " \n" + " \n" + "\n" + "\n" + - "
\n" + + " \n" + + "
\n" + "
\n" + "\n" + - " \n" + - " \n" + - "\n" + - "\n" + - "
\n" + + " \n" + + " \n" + + " \n" + "\n" + - " \n" + - " \n" + - " \n" + - "
\n" + + "
\n" + + " Sign up\n" + + " Sign in\n" + + "
\n" + "\n" + + "
\n" + "\n" + - "
\n" + - "
\n" + + " \n" + + " \n" + "\n" + "\n" + "\n" + " \n" + @@ -140,84 +146,15 @@ public class MockDownloader implements Downloader{ " \n" + "\n" + "
\n" + - " \n" + - "
\n" + - "\n" + - " \n" + - "\n" + - "\n" + - " \n" + - "\n" + - "
\n" + - " \n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "\n" + + "
\n" + "\n" + - " \n" + "
\n" + "
\n" + "\n" + - " \n" + "\n" + " \n" + "\n" + "\n" + - "\n" + - "\n" + "
\n" + " \n" + "
\n" + @@ -226,100 +163,29 @@ public class MockDownloader implements Downloader{ "\n" + "
    \n" + "\n" + - "
  • \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - " 23\n" + - " \n" + - " \n" + - " \n" + - " \n" + - " Unwatch\n" + - " \n" + - " \n" + - "\n" + - "
    \n" + - "
    \n" + - "
    \n" + - " Notification status\n" + - " \n" + - "
    \n" + - "\n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - "
    \n" + - " \n" + - "

    Not watching

    \n" + - " You only receive notifications for discussions in which you participate or are @mentioned.\n" + - " \n" + - " \n" + - " Watch\n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - "
    \n" + - " \n" + - "

    Watching

    \n" + - " You receive notifications for all discussions in this repository.\n" + - " \n" + - " \n" + - " Unwatch\n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - "
    \n" + - " \n" + - "

    Ignoring

    \n" + - " You do not receive any notifications for discussions in this repository.\n" + - " \n" + - " \n" + - " Stop ignoring\n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - "\n" + - "
    \n" + - "
    \n" + - "
    \n" + - "\n" + - "
    \n" + - "
  • \n" + "\n" + "
  • \n" + - " \n" + - "
    \n" + - " \n" + - " Unstar\n" + - " \n" + - " \n" + - " Star\n" + - " \n" + - " 78\n" + - "
    \n" + + " \n" + + " Star\n" + + "\n" + + "\n" + + " 86\n" + + "\n" + "\n" + "
  • \n" + "\n" + - "\n" + - "
  • \n" + - " \n" + - " Fork\n" + - " \n" + - " 65\n" + - "
  • \n" + - "\n" + - "\n" + + "
  • \n" + + " \n" + + " Fork\n" + + " \n" + + " \n" + + " 70\n" + + " \n" + + "
  • \n" + "
\n" + "\n" + "

\n" + @@ -357,7 +223,7 @@ public class MockDownloader implements Downloader{ "
  • \n" + " \n" + " Issues\n" + - " 7\n" + + " 2\n" + " \"Octocat-spinner-32\"\n" + "
  • \n" + "\n" + @@ -397,15 +263,6 @@ public class MockDownloader implements Downloader{ " \n" + "\n" + "\n" + - "
    \n" + - " \n" + "

    \n" + "
    \n" + "\n" + @@ -416,9 +273,8 @@ public class MockDownloader implements Downloader{ "\n" + "
    \n" + + " data-url=\"/users/set_protocol?protocol_selector=http&protocol_type=clone\">\n" + "

    HTTPS clone URL

    \n" + - "\n" + "
    \n" + " \n" + @@ -430,25 +286,9 @@ public class MockDownloader implements Downloader{ " \n" + "\n" + "
    \n" + - "

    SSH clone URL

    \n" + - "\n" + - "
    \n" + - " \n" + - "\n" + - " \n" + - "
    \n" + - "
    \n" + - "\n" + - " \n" + - "\n" + - "
    \n" + + " data-url=\"/users/set_protocol?protocol_selector=subversion&protocol_type=clone\">\n" + "

    Subversion checkout URL

    \n" + - "\n" + "
    \n" + " \n" + @@ -458,15 +298,17 @@ public class MockDownloader implements Downloader{ "
    \n" + "\n" + "\n" + - "\n" + "

    You can clone with\n" + - " HTTPS,\n" + - " SSH,\n" + - " Subversion,\n" + - " and other methods.\n" + + " HTTPS,\n" + + " or Subversion.\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + "

    \n" + "\n" + - " \n" + + " \n" + " \n" + " Clone in Desktop\n" + " \n" + @@ -492,22 +334,7 @@ public class MockDownloader implements Downloader{ "
    \n" + "\n" + "\n" + - " Edit\n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - " \n" + - "
    \n" + - "\n" + - "
    \n" + - " \n" + - " \n" + - "
    \n" + "\n" + - " \n" + - " or cancel\n" + - "\n" + "
    \n" + "\n" + "
    \n" + @@ -519,7 +346,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " \n" + - " 299\n" + + " 311\n" + " \n" + " commits\n" + " \n" + @@ -538,7 +365,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " \n" + - " 4\n" + + " 5\n" + " \n" + " releases\n" + " \n" + @@ -561,7 +388,7 @@ public class MockDownloader implements Downloader{ " \n" + " \n" + " Java\n" + - " 100.0%\n" + + " 100%\n" + " \n" + " \n" + " \n" + @@ -574,14 +401,10 @@ public class MockDownloader implements Downloader{ " class=\"repository-lang-stats-graph js-toggle-lang-stats tooltipped downwards\"\n" + " title=\"Show language statistics\"\n" + " style=\"background-color:#b07219\">\n" + - " Java\n" + + " Java\n" + " \n" + "\n" + "\n" + - " \n" + - "
    \n" + - "\n" + "\n" + "\n" + "
    \n" + @@ -595,7 +418,8 @@ public class MockDownloader implements Downloader{ "
    \n" + " \n" + + " data-ref=\"master\"\n" + + " role=\"button\" aria-label=\"Switch branches or tags\" tabindex=\"0\">\n" + " \n" + " branch:\n" + " master\n" + @@ -611,7 +435,7 @@ public class MockDownloader implements Downloader{ "\n" + "
    \n" + "
    \n" + - " \n" + + " \n" + "
    \n" + "
    \n" + "
      \n" + @@ -648,17 +472,7 @@ public class MockDownloader implements Downloader{ "
    \n" + "
    \n" + "\n" + - "
    \n" + - " \n" + - "
    \n" + - "

    Create branch:

    \n" + - " from ‘master’\n" + - "
    \n" + - " \n" + - " \n" + - " \n" + - "
    \n" + - "\n" + + "
    Nothing to show
    \n" + "
    \n" + "\n" + "
    \n" + @@ -667,6 +481,10 @@ public class MockDownloader implements Downloader{ "\n" + "
    \n" + " \n" + + " webmagic-parent-0.3.1\n" + + "
    \n" + + "
    \n" + + " \n" + " webmagic-parent-0.2.1\n" + "
    \n" + "
    \n" + @@ -691,13 +509,13 @@ public class MockDownloader implements Downloader{ "
    \n" + "\n" + "\n" + - " \n" + + " \n" + "
    \n" + "\n" + "\n" + "\n" + "Show File Finder\n" + + " data-hotkey=\"t\" class=\"js-show-file-finder\" style=\"display:none\" data-pjax>Show File Finder\n" + "
    \n" + " \n" + " \n" + @@ -735,12 +553,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + - " \n" + + " \n" + " \n" + " \n" + "
    \n" + - " webmagic-core\n" + + " webmagic-core\n" + " \n" + - " fix null pointe exception #26\n" + + " add example\n" + "
    \n" + @@ -748,12 +566,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + - " webmagic-extension\n" + + " webmagic-extension\n" + " \n" + - " fix null pointe exception #26\n" + + " add example\n" + "
    \n" + @@ -774,12 +592,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + - " webmagic-samples\n" + + " webmagic-samples\n" + " \n" + - " update version for samples\n" + + " some bugfix\n" + "
    \n" + @@ -813,12 +631,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + - " zh_docs\n" + + " zh_docs\n" + " \n" + - " update version\n" + + " update version\n" + "
    \n" + @@ -852,12 +670,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + - " README.md\n" + + " README.md\n" + " \n" + - " update version\n" + + " update version\n" + "
    \n" + @@ -865,12 +683,12 @@ public class MockDownloader implements Downloader{ " \"Octocat-spinner-32\"\n" + " \n" + - " pom.xml\n" + + " pom.xml\n" + " \n" + - " 将单元测试fork独立的JVM来跑。避免少数情况默认maven开的JVM堆太小。\n" + + " [maven-release-plugin] prepare for next development iteration\n" + "
    \n" + @@ -932,12 +750,12 @@ public class MockDownloader implements Downloader{ "
        <dependency>\n" +
                 "        <groupId>us.codecraft</groupId>\n" +
                 "        <artifactId>webmagic-core</artifactId>\n" +
    -            "        <version>0.3.0</version>\n" +
    +            "        <version>0.3.1</version>\n" +
                 "    </dependency>\n" +
                 "    <dependency>\n" +
                 "        <groupId>us.codecraft</groupId>\n" +
                 "        <artifactId>webmagic-extension</artifactId>\n" +
    -            "        <version>0.3.0</version>\n" +
    +            "        <version>0.3.1</version>\n" +
                 "    </dependency>\n" +
                 "
    \n" + "\n" + @@ -949,7 +767,7 @@ public class MockDownloader implements Downloader{ "\n" + "

    Write a class implements PageProcessor:

    \n" + "\n" + - "
        public class OschinaBlogPageProcesser implements PageProcessor {\n" +
    +            "
        public class OschinaBlogPageProcesser implements PageProcessor {\n" +
                 "\n" +
                 "        private Site site = Site.me().setDomain(\"my.oschina.net\")\n" +
                 "           .addStartUrl(\"http://my.oschina.net/flashsword/blog\");\n" +
    @@ -984,7 +802,7 @@ public class MockDownloader implements Downloader{
                 "\n" +
                 "

    You can also use annotation way:

    \n" + "\n" + - "
        @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
    +            "
        @TargetUrl(\"http://my.oschina.net/flashsword/blog/\\\\d+\")\n" +
                 "    public class OschinaBlog {\n" +
                 "\n" +
                 "        @ExtractBy(\"//title\")\n" +
    @@ -1071,7 +889,7 @@ public class MockDownloader implements Downloader{
                 "    \n" +
                 "\n" +
                 "    
    \n" + "\n" + - " \n" + " \n" + "\n" + "\n"; diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java index 97555503..d6e1bf02 100644 --- a/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java @@ -18,8 +18,8 @@ public class GithubRepoTest { , new PageModelPipeline() { @Override public void process(GithubRepo o, Task task) { - Assert.assertEquals(78, o.getStar()); - Assert.assertEquals(65, o.getFork()); + Assert.assertEquals(86, o.getStar()); + Assert.assertEquals(70, o.getFork()); } }, GithubRepo.class).setDownloader(new MockDownloader()).test("https://github.com/code4craft/webmagic"); } From cc3b787991a43bd9ea0c5f06851ec9815d56f00a Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 13:12:19 +0800 Subject: [PATCH 11/12] [maven-release-plugin] prepare release webmagic-0.3.2 --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index e2685a8a..ce86bff7 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.2-SNAPSHOT + 0.3.2 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - HEAD + webmagic-0.3.2 diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 2506d71a..ec8a90b4 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2-SNAPSHOT + 0.3.2 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 7befae77..816df53b 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2-SNAPSHOT + 0.3.2 4.0.0 From 
4acbc19cee6dc81488493498811116a7fe0ff14b Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Mon, 23 Sep 2013 13:12:32 +0800 Subject: [PATCH 12/12] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- webmagic-core/pom.xml | 2 +- webmagic-extension/pom.xml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index ce86bff7..8d25a666 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 7 us.codecraft - 0.3.2 + 0.3.3-SNAPSHOT 4.0.0 pom @@ -36,7 +36,7 @@ scm:git:git@github.com:code4craft/webmagic.git scm:git:git@github.com:code4craft/webmagic.git git@github.com:code4craft/webmagic.git - webmagic-0.3.2 + HEAD diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index ec8a90b4..f68114a3 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2 + 0.3.3-SNAPSHOT 4.0.0 diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 816df53b..1e36b79a 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.3.2 + 0.3.3-SNAPSHOT 4.0.0