diff --git a/pom.xml b/pom.xml index 2309a15d..81c57671 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ us.codecraft xsoup - 0.2.4-SNAPSHOT + 0.2.4 com.alibaba diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java index c639b638..22ae5eb4 100644 --- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java +++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/MamacnPageProcessor.java @@ -10,6 +10,7 @@ import us.codecraft.webmagic.selector.Selectable; import java.io.FileNotFoundException; import java.io.UnsupportedEncodingException; +import java.util.List; /** * @author code4crafer@gmail.com @@ -20,11 +21,14 @@ public class MamacnPageProcessor implements PageProcessor { @Override public void process(Page page) { - Selectable images = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li"); - page.putField("img", images.xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@src").get()); - page.putField("title", page.getHtml().xpath("//div[@class=picList]/div[@class=pre]/div[@class=npic]//img/@alt").get()); - page.putField("url", page.getUrl().toString()); - if (page.getResultItems().get("title") == null) { + List nodes = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes(); + StringBuilder accum = new StringBuilder(); + for (Selectable node : nodes) { + accum.append("img:").append(node.xpath("//a/@href").get()).append("\n"); + accum.append("title:").append(node.xpath("//img/@alt").get()).append("\n"); + } + page.putField("",accum.toString()); + if (accum.length() == 0) { page.setSkip(true); } page.addTargetRequests(page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all());