From 84b897f83bcd9524bb57f0a5082fbe48bb6133cd Mon Sep 17 00:00:00 2001 From: "yihua.huang" Date: Sun, 13 Apr 2014 12:20:57 +0800 Subject: [PATCH] update AngularJSProcessor --- .../webmagic/selector/JsonPathSelector.java | 2 -- .../codecraft/webmagic/selector/JsonTest.java | 5 ++++ .../webmagic/samples/AngularJSProcessor.java | 28 +++++++++++++++++-- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java index 725dac50..f9083a8b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/JsonPathSelector.java @@ -1,7 +1,6 @@ package us.codecraft.webmagic.selector; import com.jayway.jsonpath.JsonPath; -import us.codecraft.webmagic.utils.Experimental; import java.util.ArrayList; import java.util.List; @@ -13,7 +12,6 @@ import java.util.List; * @author code4crafter@gmail.com
* @since 0.2.1 */ -@Experimental public class JsonPathSelector implements Selector { private String jsonPathStr; diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java index 89afbb6f..f77e30d1 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.selector; import org.junit.Test; +import us.codecraft.webmagic.Page; import static org.assertj.core.api.Assertions.assertThat; @@ -16,5 +17,9 @@ public class JsonTest { public void testRemovePadding() throws Exception { String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); assertThat(name).isEqualTo("json"); + Page page = null; + + page.getJson().jsonPath("$.name").get(); + page.getJson().removePadding("callback").jsonPath("$.name").get(); } } diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java index c861b036..18719bdc 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/AngularJSProcessor.java @@ -1,21 +1,43 @@ package us.codecraft.webmagic.samples; +import org.apache.commons.collections.CollectionUtils; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.JsonPathSelector; + +import java.util.List; /** - * @author yihua.huang@dianping.com + * @author code4crafter@gmail.com + * @since 0.5.0 */ -public class AngularJSProcessor implements PageProcessor{ +public class AngularJSProcessor implements PageProcessor { + + private Site site = Site.me(); + + private static final String ARITICALE_URL = "http://angularjs\\.cn/api/article/\\w+"; + + private static final String LIST_URL = "http://angularjs\\.cn/api/article/latest.*"; @Override public void process(Page page) { + if (page.getUrl().regex(LIST_URL).match()) { + List ids = new JsonPathSelector("$.data._id").selectList(page.getRawText()); + if (CollectionUtils.isNotEmpty(ids)) { + for (String id : ids) { + page.addTargetRequest("http://angularjs\\.cn/api/article/" + id); + } + } + } else { + page.putField("title", new JsonPathSelector("$.title").select(page.getRawText())); + page.putField("content", new JsonPathSelector("$.content").select(page.getRawText())); + } } @Override public Site getSite() { - return null; + return site; } }