diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index f3d29aa9..493c7629 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -59,7 +59,7 @@ public class Html extends PlainText { @Override public Selectable xpath(String xpath) { - XpathSelector xpathSelector = Selectors.xpath(xpath); + XsoupSelector xpathSelector = new XsoupSelector(xpath); return selectList(xpathSelector, strings); } diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java index 5c6ebbf8..10996362 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java @@ -1,10 +1,7 @@ package us.codecraft.webmagic.utils; import us.codecraft.webmagic.model.annotation.ExtractBy; -import us.codecraft.webmagic.selector.CssSelector; -import us.codecraft.webmagic.selector.RegexSelector; -import us.codecraft.webmagic.selector.Selector; -import us.codecraft.webmagic.selector.XpathSelector; +import us.codecraft.webmagic.selector.*; import java.util.ArrayList; import java.util.List; @@ -27,10 +24,10 @@ public class ExtractorUtils { selector = new RegexSelector(value); break; case XPath: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); break; default: - selector = new XpathSelector(value); + selector = new XsoupSelector(value); } return selector; } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 35ddcaa4..a349a68e 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -5,7 +5,7 @@ webmagic-parent us.codecraft - 0.2.1 + 0.2.2-SNAPSHOT 4.0.0 diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index 115f1834..3ceba0af 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.PlainText; @@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor { page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()")); page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } @@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor { } return site; } + + public static void main(String[] args) { + Spider.create(new DiaoyuwengProcessor()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 4ffe127b..7124a8c5 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all(); + List strings = page.getHtml().links().regex(".*/yewu/.*").all(); page.addTargetRequests(strings); page.putField("title",page.getHtml().regex("(.*)")); - page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); + page.putField("body",page.getHtml().xpath("//dd")); } @Override public Site getSite() { return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/"); //To change body of implemented methods use File | Settings | File Templates. } + + public static void main(String[] args) { + Spider.create(new F58PageProcesser()).run(); + } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 89b74d63..4ac93107 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; import java.util.List; @@ -26,4 +27,8 @@ public class HuxiuProcessor implements PageProcessor { return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"). setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); } + + public static void main(String[] args) { + Spider.create(new HuxiuProcessor()).run(); + } }