diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java new file mode 100644 index 00000000..0e7e3b92 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcesser.java @@ -0,0 +1,36 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +/** + * @author code4crafter@gmail.com
+ * @since 0.3.2 + */ +public class GithubRepoPageProcesser implements PageProcessor { + + private Site site = Site.me().addStartUrl("https://github.com/code4craft").setRetryTimes(3).setSleepTime(100); + + @Override + public void process(Page page) { + page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); + page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); + page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); + if (page.getResultItems().get("name")==null){ + //skip this page + page.setSkip(true); + } + page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); + } + + @Override + public Site getSite() { + return site; + } + + public static void main(String[] args) { + Spider.create(new GithubRepoPageProcesser()).thread(5).run(); + } +} diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java new file mode 100644 index 00000000..fa8dab6d --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcesser.java @@ -0,0 +1,39 @@ +package us.codecraft.webmagic.processor.example; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; + +import java.util.List; + +/** + * @author code4crafter@gmail.com
+ */ +public class OschinaBlogPageProcesser implements PageProcessor { + + private Site site = Site.me().setDomain("my.oschina.net").addStartUrl("http://my.oschina.net/flashsword/blog"); + + @Override + public void process(Page page) { + List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); + page.addTargetRequests(links); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); + if (page.getResultItems().get("title") == null) { + //skip this page + page.setSkip(true); + } + page.putField("content", page.getHtml().xpath("//div[@class='BlogContent']/tidyText()").toString()); + page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); + } + + @Override + public Site getSite() { + return site; + + } + + public static void main(String[] args) { + Spider.create(new OschinaBlogPageProcesser()).thread(2).run(); + } +} diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java index d9501154..58441cbc 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepo.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.example; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.model.ConsolePageModelPipeline; import us.codecraft.webmagic.model.HasKey; +import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.HelpUrl; @@ -10,6 +13,7 @@ import java.util.List; /** * @author code4crafter@gmail.com
+ * @since 0.3.2 */ @TargetUrl("https://github.com/\\w+/\\w+") @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) @@ -27,15 +31,20 @@ public class GithubRepo implements HasKey { @ExtractBy(value = "//div[@class='repository-lang-stats']//li//span[@class='lang']/text()", multi = true) private List language; - @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()") + @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") private int star; - @ExtractBy("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()") + @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") private int fork; @ExtractByUrl private String url; + public static void main(String[] args) { + OOSpider.create(Site.me().addStartUrl("https://github.com/code4craft").setSleepTime(100) + , new ConsolePageModelPipeline(), GithubRepo.class).thread(10).run(); + } + @Override public String key() { return author + ":" + name; diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java index 703d6a4f..1545f885 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/example/OschinaBlog.java @@ -1,38 +1,38 @@ package us.codecraft.webmagic.example; -import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.AfterExtractor; import us.codecraft.webmagic.model.OOSpider; import us.codecraft.webmagic.model.annotation.ExtractBy; import us.codecraft.webmagic.model.annotation.Formatter; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.JsonFilePageModelPipeline; +import java.util.Date; import java.util.List; /** * @author code4crafter@gmail.com
+ * @since 0.3.2 */ @TargetUrl("http://my.oschina.net/flashsword/blog/\\d+") -public class OschinaBlog implements AfterExtractor{ +public class OschinaBlog { @ExtractBy("//title/text()") private String title; - @ExtractBy(value = "div.BlogContent",type = ExtractBy.Type.Css) + @ExtractBy(value = "div.BlogContent", type = ExtractBy.Type.Css) private String content; @ExtractBy(value = "//div[@class='BlogTags']/a/text()", multi = true) private List tags; - @Formatter("YYYY-MM-dd HH:mm") + @Formatter("yyyy-MM-dd HH:mm") @ExtractBy("//div[@class='BlogStat']/regex('\\d+-\\d+-\\d+\\s+\\d+:\\d+')") - private String date; + private Date date; public static void main(String[] args) { OOSpider.create(Site.me().addStartUrl("http://my.oschina.net/flashsword/blog") - ,new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); + , new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class).run(); } public String getTitle() { @@ -47,13 +47,8 @@ public class OschinaBlog implements AfterExtractor{ return tags; } -// public Date getDate() { -// return date; -// } - - @Override - public void afterProcess(Page page) { - System.out.println(date); - System.out.println(title); + public Date getDate() { + return date; } + } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java index cd3e72b4..c78bd31e 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelExtractor.java @@ -105,7 +105,8 @@ class PageModelExtractor { Formatter formatter = field.getAnnotation(Formatter.class); if (formatter != null) { if (!formatter.formatter().equals(ObjectFormatter.class)) { - return initFormatter(formatter.formatter()); + ObjectFormatter objectFormatter = initFormatter(formatter.formatter()); + objectFormatter.initParam(formatter.value()); } } return initFormatter(ObjectFormatters.get(fieldClazz)); @@ -311,6 +312,9 @@ class PageModelExtractor { } if (fieldExtractor.getObjectFormatter() != null) { Object converted = convert(value, fieldExtractor.getObjectFormatter()); + if (converted == null && fieldExtractor.isNotNull()) { + return null; + } setField(o, fieldExtractor, converted); } else { setField(o, fieldExtractor, value); @@ -332,7 +336,11 @@ class PageModelExtractor { private Object convert(String value, ObjectFormatter objectFormatter) { try { - return objectFormatter.format(value); + Object format = objectFormatter.format(value); + if (logger.isDebugEnabled()) { + logger.debug("String " + value + " is converted to " + format); + } + return format; } catch (Exception e) { logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e); } @@ -351,6 +359,9 @@ class PageModelExtractor { } private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException { + if (value==null){ + return; + } if (fieldExtractor.getSetterMethod() != null) { fieldExtractor.getSetterMethod().invoke(o, value); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java index 0ad0302b..b0f6e771 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java @@ -10,7 +10,7 @@ import java.util.Date; */ public class DateFormatter implements ObjectFormatter { - private String[] datePatterns = new String[]{"YYYY-MM-dd HH:mm"}; + private String[] datePatterns = new String[]{"yyyy-MM-dd HH:mm"}; @Override public Date format(String raw) throws Exception { diff --git a/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java new file mode 100644 index 00000000..a621e2dc --- /dev/null +++ b/webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java @@ -0,0 +1,20 @@ +package us.codecraft.webmagic.formatter; + +import org.junit.Test; +import us.codecraft.webmagic.model.formatter.DateFormatter; + +import java.util.Date; + +/** + * @author code4crafter@gmail.com + */ +public class DateFormatterTest { + + @Test + public void testDateFormatter() throws Exception { + DateFormatter dateFormatter = new DateFormatter(); + dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"}); + Date format = dateFormatter.format("2013-09-10 22:11"); + System.out.println(format); + } +} diff --git a/webmagic-extension/src/test/resouces/log4j.xml b/webmagic-extension/src/test/resouces/log4j.xml new file mode 100644 index 00000000..a58e889b --- /dev/null +++ b/webmagic-extension/src/test/resouces/log4j.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +