diff --git a/pom.xml b/pom.xml index 7f49dc08..06ea7cd6 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,7 @@ webmagic-core webmagic-extension/ + webmagic-samples diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java index 8c0e32dc..2bc3f954 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/News163.java @@ -2,9 +2,9 @@ package us.codecraft.webmagic.model.samples; import us.codecraft.webmagic.PagedModel; import us.codecraft.webmagic.Site; -import us.codecraft.webmagic.model.*; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ComboExtract; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.model.annotation.ExtractBy2; import us.codecraft.webmagic.model.annotation.ExtractByUrl; import us.codecraft.webmagic.model.annotation.TargetUrl; import us.codecraft.webmagic.pipeline.ConsolePipeline; @@ -16,8 +16,8 @@ import java.util.List; /** * @author code4crafter@gmail.com
- * Date: 13-8-4
- * Time: 下午8:17
+ * Date: 13-8-4
+ * Time: 下午8:17
*/ @TargetUrl("http://news.163.com/\\d+/\\d+/\\d+/\\w+*.html") public class News163 implements PagedModel { @@ -28,8 +28,9 @@ public class News163 implements PagedModel { @ExtractByUrl(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", notNull = false) private String page; - @ExtractBy(value = "//div[@class=\"ep-pages\"]//a/@href", multi = true,notNull = false) - @ExtractBy2(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy2.Type.Regex) + @ComboExtract(value = {@ExtractBy("//div[@class=\"ep-pages\"]//a/@href"), + @ExtractBy(value = "http://news\\.163\\.com/\\d+/\\d+/\\d+/\\w+_(\\d+)\\.html", type = ExtractBy.Type.Regex)}, + multi = true, notNull = false) private List otherPage; @ExtractBy("//h1[@id=\"h1title\"]/text()")