diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 4332fa32..c1ecff3e 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -33,7 +33,7 @@ public class HttpClientDownloader implements Downloader { if (site.getAcceptStatCode().contains(statusCode)) { if (site.getEncoding() == null){ String value = httpResponse.getEntity().getContentType().getValue(); - site.setEncoding(new PlainText(value).r("charset=([^\\s]+)").toString()); + site.setEncoding(new PlainText(value).regex("charset=([^\\s]+)").toString()); } String content = IOUtils.toString(httpResponse.getEntity().getContent(), site.getEncoding()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java index 3ffc9a32..0d524462 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java @@ -28,13 +28,13 @@ public class SimplePageProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs(urlPattern).toStrings(); + List requests = page.getHtml().links().regex(urlPattern).toStrings(); //调用page.addTargetRequests()方法添加待抓取链接 page.addTargetRequests(requests); //xpath方式抽取 - page.putField("title", page.getHtml().x("//title")); + page.putField("title", page.getHtml().xpath("//title")); //sc表示使用Readability技术抽取正文 - page.putField("content", page.getHtml().sc()); + page.putField("content", page.getHtml().smartContent()); } @Override diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java index 3b3c80af..3cc84f79 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java @@ -18,12 +18,6 @@ public class Html extends PlainText { super(text); } - @Override - public Selectable x(String xpath) { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); - return select(xpathSelector,strings); - } - @Override protected Selectable select(Selector selector, List strings) { List results = new ArrayList(); @@ -47,25 +41,19 @@ public class Html extends PlainText { } @Override - public Selectable sc() { + public Selectable smartContent() { SmartContentSelector smartContentSelector = SelectorFactory.getInstatnce().newSmartContentSelector(); return select(smartContentSelector,strings); } @Override - public Selectable a() { - XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); - return select(xpathSelector,strings); - } - - @Override - public Selectable as() { + public Selectable links() { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector("//a/@href"); return selectList(xpathSelector,strings); } @Override - public Selectable xs(String xpath) { + public Selectable xpath(String xpath) { XpathSelector xpathSelector = SelectorFactory.getInstatnce().newXpathSelector(xpath); return selectList(xpathSelector, strings); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java index a11c9a29..935ababa 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/PlainText.java @@ -25,38 +25,22 @@ public class PlainText implements Selectable { } @Override - public Selectable x(String xpath) { + public Selectable 
xpath(String xpath) { throw new UnsupportedOperationException(); } @Override - public Selectable xs(String xpath) { + public Selectable smartContent() { throw new UnsupportedOperationException(); } @Override - public Selectable sc() { + public Selectable links() { throw new UnsupportedOperationException(); } @Override - public Selectable a() { - throw new UnsupportedOperationException(); - } - - @Override - public Selectable as() { - throw new UnsupportedOperationException(); - } - - @Override - public Selectable r(String regex) { - RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); - return select(regexSelector, strings); - } - - @Override - public Selectable rs(String regex) { + public Selectable regex(String regex) { RegexSelector regexSelector = SelectorFactory.getInstatnce().newRegexSelector(regex); return selectList(regexSelector, strings); } @@ -82,7 +66,7 @@ public class PlainText implements Selectable { } @Override - public Selectable rp(String regex, String replacement) { + public Selectable replace(String regex, String replacement) { ReplaceSelector replaceSelector = SelectorFactory.getInstatnce().newReplaceSelector(regex, replacement); return select(replaceSelector, strings); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java index 921e6c3f..630808d3 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectable.java @@ -9,51 +9,27 @@ import java.util.List; */ public interface Selectable { - /** - * select with xpath - * - * @param xpath - * @return new Selectable after extract - */ - public Selectable x(String xpath); - /** * select list with xpath * * @param xpath * @return new Selectable after extract */ - public Selectable xs(String xpath); + public Selectable xpath(String xpath); /** * select smart 
content with ReadAbility algorithm * * @return content */ - public Selectable sc(); - - /** - * select a link - * - * @return first link - */ - public Selectable a(); + public Selectable smartContent(); /** * select all links * * @return all links */ - public Selectable as(); - - - /** - * select with regex - * - * @param regex - * @return new Selectable after extract - */ - public Selectable r(String regex); + public Selectable links(); /** * select list with regex @@ -61,7 +37,7 @@ public interface Selectable { * @param regex * @return new Selectable after extract */ - public Selectable rs(String regex); + public Selectable regex(String regex); /** * replace with regex @@ -70,7 +46,7 @@ public interface Selectable { * @param replacement * @return new Selectable after extract */ - public Selectable rp(String regex, String replacement); + public Selectable replace(String regex, String replacement); /** * single string result diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java index 6dacc983..fcdbfeff 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/HtmlTest.java @@ -14,7 +14,7 @@ public class HtmlTest { @Test public void testRegexSelector() { Html selectable = new Html("aaaaaaab"); - Assert.assertEquals("abbabbab", (selectable.r("(.*)").rp("aa(a)", "$1bb").toString())); + Assert.assertEquals("abbabbab", (selectable.regex("(.*)").replace("aa(a)", "$1bb").toString())); } } diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 8ee88859..bebbb83d 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -1350,7 +1350,7 @@ public class 
XpathSelectorTest { @Test public void testOschina() { Html html1 = new Html(html); - Assert.assertEquals("再次吐槽easyui", html1.x(".//*[@class='QTitle']/h1/a").toString()); + Assert.assertEquals("再次吐槽easyui", html1.xpath(".//*[@class='QTitle']/h1/a").toString()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java index a5b355cb..e5aafe7a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiandianBlogProcessor.java @@ -17,20 +17,20 @@ public class DiandianBlogProcessor implements PageProcessor { @Override public void process(Page page) { - //a()表示提取链接,as()表示提取所有链接 + //links() extracts all links from the page //getHtml()返回Html对象,支持链式调用 - //r()表示用正则表达式提取一条内容,rs()表示提取多条内容 + //regex() extracts multiple matches using a regular expression //toString()表示取单条结果,toStrings()表示取多条 - List requests = page.getHtml().as().rs("(.*/post/.*)").toStrings(); + List requests = page.getHtml().links().regex("(.*/post/.*)").toStrings(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 - page.putField("title", page.getHtml().x("//title").r("(.*?)\\|")); - //sc()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 - page.putField("content", page.getHtml().sc()); - page.putField("date", page.getUrl().r("post/(\\d+-\\d+-\\d+)/")); - page.putField("id", page.getUrl().r("post/\\d+-\\d+-\\d+/(\\d+)")); + page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|")); + //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 + page.putField("content", page.getHtml().smartContent()); + page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); + page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); } @Override diff --git 
a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java index 2f28e6a3..7a211882 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DianpingProcessor.java @@ -15,13 +15,13 @@ import java.util.List; public class DianpingProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs(".*shop.*").toStrings(); + List requests = page.getHtml().links().regex(".*shop.*").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().rs(".*search/category/.*").toStrings(); + requests = page.getHtml().regex(".*search/category/.*").toStrings(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("shop")) { - page.putField("title", page.getHtml().x("//h1[@class='shop-title']")); - page.putField("content", page.getHtml().sc()); + page.putField("title", page.getHtml().xpath("//h1[@class='shop-title']")); + page.putField("content", page.getHtml().smartContent()); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java index f5032ff7..13ed2e11 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java @@ -18,15 +18,15 @@ public class DiaoyuwengProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); + List requests = 
page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").toStrings(); page.addTargetRequests(requests); - requests = page.getHtml().as().rs("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); + requests = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1.html)").toStrings(); page.addTargetRequests(requests); if (page.getUrl().toString().contains("thread")){ - page.putField("title", page.getHtml().x("//a[@id='thread_subject']")); - page.putField("content", page.getHtml().x("//div[@class='pcb']//tbody")); - page.putField("date",page.getHtml().r("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); - page.putField("id",new PlainText("1000"+page.getUrl().r("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); + page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']")); + page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody")); + page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)")); + page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString())); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java index 385e3f27..9d5140a2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java @@ -15,10 +15,10 @@ public class F58PageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}").toStrings(); page.addTargetRequests(strings); - 
page.putField("title",page.getHtml().r("(.*)")); - page.putField("body",page.getHtml().x("//dd[@class='w133']")); + page.putField("title",page.getHtml().regex("(.*)")); + page.putField("body",page.getHtml().xpath("//dd[@class='w133']")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java index 1fa0b7b5..26c60cc2 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java @@ -15,10 +15,10 @@ public class HuxiuProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().rs("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("\"']*href=[\"']{1}([/]{0,1}article[^<>#\"']*?)[\"']{1}").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@class='neirong']//h1[@class='ph xs5']")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@class='neirong']//h1[@class='ph xs5']")); + page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java index 8ea4afe2..0a51b364 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java @@ -13,10 +13,10 @@ public class KaichibaProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - int i = 
Integer.valueOf(page.getUrl().r("shop/(\\d+)").toString()) + 1; + int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; page.addTargetRequest("http://kaichiba.com/shop/" + i); - page.putField("title",page.getHtml().x("//Title")); - page.putField("items", page.getHtml().xs("//li[@class=\"foodTitle\"]").rp("^\\s+", "").rp("\\s+$", "").rp(".*?", "")); + page.putField("title",page.getHtml().xpath("//Title")); + page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java index 98fe8de7..bd218113 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MeicanProcessor.java @@ -15,14 +15,14 @@ public class MeicanProcessor implements PageProcessor { @Override public void process(Page page) { //http://progressdaily.diandian.com/post/2013-01-24/40046867275 - List requests = page.getHtml().xs("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); + List requests = page.getHtml().xpath("//a[@class=\"area_link flat_btn\"]/@href").toStrings(); if (requests.size() > 2) { requests = requests.subList(0, 2); } page.addTargetRequests(requests); - page.addTargetRequests(page.getHtml().as().rs("(.*/restaurant/[^#]+)").toStrings()); - page.putField("items", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); - page.putField("prices", page.getHtml().xs("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); + page.addTargetRequests(page.getHtml().links().regex("(.*/restaurant/[^#]+)").toStrings()); + page.putField("items", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"name\"]")); + 
page.putField("prices", page.getHtml().xpath("//ul[@class=\"dishes menu_dishes\"]/li/span[@class=\"price_outer\"]/span[@class=\"price\"]")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java index e4cc33cf..a7e9c9ba 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java @@ -14,10 +14,10 @@ import java.util.List; public class NjuBBSProcessor implements PageProcessor { @Override public void process(Page page) { - List requests = page.getHtml().rs("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); + List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java index 0d6354d8..9293b41c 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaBlogPageProcesser.java @@ -15,11 +15,11 @@ public class OschinaBlogPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().as().r("(http://my\\.oschina\\.net)").toStrings(); + List strings = page.getHtml().links().regex("(http://my\\.oschina\\.net)").toStrings(); page.addTargetRequests(strings); - page.putField("title", 
page.getHtml().xs("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); - page.putField("content", page.getHtml().sc()); - page.putField("author", page.getUrl().r("my\\.oschina\\.net/(\\w+)/blog/\\d+")); + page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1")); + page.putField("content", page.getHtml().smartContent()); + page.putField("author", page.getUrl().regex("my\\.oschina\\.net/(\\w+)/blog/\\d+")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java index b708ec51..f88ce06d 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java @@ -15,10 +15,10 @@ public class OschinaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").toStrings(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@class='QTitle']/h1/a")); - page.putField("content", page.getHtml().xs("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); + page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); + page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java index 400ebd5e..bf4dcc2a 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java +++ 
b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java @@ -18,10 +18,10 @@ public class QzoneBlogProcessor implements PageProcessor { //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone - List requests = page.getHtml().rs("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); + List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").toStrings(); page.addTargetRequests(requests); - page.putField("title",page.getHtml().x("//div[@id='content']//h2/a")); - page.putField("content",page.getHtml().sc()); + page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); + page.putField("content",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java index 5bc2fc68..bb77931f 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/SinaBlogProcesser.java @@ -15,12 +15,12 @@ public class SinaBlogProcesser implements PageProcessor { @Override public void process(Page page) { - page.addTargetRequests(page.getHtml().as().rs("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); - page.putField("title", page.getHtml().x("//div[@class='articalTitle']/h2")); - page.putField("content",page.getHtml().x("//div[@id='articlebody']//div[@class='articalContent']")); - page.putField("id",page.getUrl().r("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); - 
page.putField("date",page.getHtml().x("//div[@id='articlebody']//span[@class='time SG_txtc']").r("\\((.*)\\)")); -// page.putField("tags",page.getHtml().xs("//td[@class='blog_tag']/h3/a")); + page.addTargetRequests(page.getHtml().links().regex("(http://blog\\.sina\\.com\\.cn/s/blog_.*)").toStrings()); + page.putField("title", page.getHtml().xpath("//div[@class='articalTitle']/h2")); + page.putField("content",page.getHtml().xpath("//div[@id='articlebody']//div[@class='articalContent']")); + page.putField("id",page.getUrl().regex("http://blog\\.sina\\.com\\.cn/s/blog_(\\w+)")); + page.putField("date",page.getHtml().xpath("//div[@id='articlebody']//span[@class='time SG_txtc']").regex("\\((.*)\\)")); +// page.putField("tags",page.getHtml().xpath("//td[@class='blog_tag']/h3/a")); } @Override diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java index a15ef74a..278657f0 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java @@ -15,10 +15,10 @@ public class TianyaPageProcesser implements PageProcessor { @Override public void process(Page page) { - List strings = page.getHtml().rs("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); + List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").toStrings(); page.addTargetRequests(strings); - page.putField("title", page.getHtml().x("//div[@id='post_head']//span[@class='s_title']//b")); - page.putField("body",page.getHtml().sc()); + page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); + page.putField("body",page.getHtml().smartContent()); } @Override diff --git a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java 
b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java index 6293884c..681aac78 100644 --- a/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java +++ b/webmagic-samples/src/test/java/us/codecraft/webmagic/SpiderTest.java @@ -51,7 +51,7 @@ public class SpiderTest { /** * - * _hrefs = rs("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") + * _hrefs = regex("]*href=[\"']{1}(/yewu/.*?)[\"']{1}") * title = r(""(.*)"") * body = x("//dd[@class='w133']") * @@ -72,7 +72,7 @@ public class SpiderTest { * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x("").r("",1,2).c()) * * body=body[r(_currentUrl).g(1)] - * tags[%] = (tags[%] + xs('')) . r('') + * tags[%] = (tags[%] + xpath('')) . r('') * * _targetUrls.add('' + x('').r('')) * _sourceUrls.add() @@ -114,7 +114,7 @@ public class SpiderTest { * content = t(_html) > c() * title = x(_html, 'asd@asd') > r('',1) * body[r(_currentUrl).g(1)] = body[r(_currentUrl).g(1)] + (x('') > r('',1,2) > c()) | x('') - * tags[%] = tags + xs('') > r('') + * tags[%] = tags + xpath('') > r('') * model.setTargetUrl(); * * _targetUrl = '' + x('') & r('')