From d7cd9e5747859b41cc5d97fbebfc80bdc88ad78b Mon Sep 17 00:00:00 2001
From: "yihua.huang" <code4crafter@gmail.com>
Date: Mon, 2 Sep 2013 11:56:01 +0800
Subject: [PATCH] use XsoupSelector for XPath; update samples pom and processors

---
 .../main/java/us/codecraft/webmagic/selector/Html.java   | 2 +-
 .../java/us/codecraft/webmagic/utils/ExtractorUtils.java | 9 +++------
 webmagic-samples/pom.xml                                 | 2 +-
 .../codecraft/webmagic/samples/DiaoyuwengProcessor.java  | 7 ++++++-
 .../us/codecraft/webmagic/samples/F58PageProcesser.java  | 9 +++++++--
 .../us/codecraft/webmagic/samples/HuxiuProcessor.java    | 5 +++++
 6 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
index f3d29aa9..493c7629 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Html.java
@@ -59,7 +59,7 @@ public class Html extends PlainText {
 
     @Override
     public Selectable xpath(String xpath) {
-        XpathSelector xpathSelector = Selectors.xpath(xpath);
+        XsoupSelector xpathSelector = new XsoupSelector(xpath);
         return selectList(xpathSelector, strings);
     }
 
diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
index 5c6ebbf8..10996362 100644
--- a/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
+++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ExtractorUtils.java
@@ -1,10 +1,7 @@
 package us.codecraft.webmagic.utils;
 
 import us.codecraft.webmagic.model.annotation.ExtractBy;
-import us.codecraft.webmagic.selector.CssSelector;
-import us.codecraft.webmagic.selector.RegexSelector;
-import us.codecraft.webmagic.selector.Selector;
-import us.codecraft.webmagic.selector.XpathSelector;
+import us.codecraft.webmagic.selector.*;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -27,10 +24,10 @@ public class ExtractorUtils {
                 selector = new RegexSelector(value);
                 break;
             case XPath:
-                selector = new XpathSelector(value);
+                selector = new XsoupSelector(value);
                 break;
             default:
-                selector = new XpathSelector(value);
+                selector = new XsoupSelector(value);
         }
         return selector;
     }
diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml
index 35ddcaa4..a349a68e 100644
--- a/webmagic-samples/pom.xml
+++ b/webmagic-samples/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>webmagic-parent</artifactId>
         <groupId>us.codecraft</groupId>
-        <version>0.2.1</version>
+        <version>0.2.2-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
index 115f1834..3ceba0af 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/DiaoyuwengProcessor.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
 
 import us.codecraft.webmagic.Page;
 import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.processor.PageProcessor;
 import us.codecraft.webmagic.selector.PlainText;
 
@@ -24,7 +25,7 @@ public class DiaoyuwengProcessor implements PageProcessor {
         page.addTargetRequests(requests);
         if (page.getUrl().toString().contains("thread")){
             page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
-            page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody"));
+            page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
             page.putField("date",page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
             page.putField("id",new PlainText("1000"+page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1.html").toString()));
         }
@@ -38,4 +39,8 @@ public class DiaoyuwengProcessor implements PageProcessor {
         }
         return site;
     }
+
+    public static void main(String[] args) {
+        Spider.create(new DiaoyuwengProcessor()).run();
+    }
 }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
index 4ffe127b..7124a8c5 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
 
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.processor.PageProcessor;
 
 import java.util.List;
@@ -15,14 +16,18 @@ public class F58PageProcesser implements PageProcessor {
 
     @Override
     public void process(Page page) {
-        List<String> strings = page.getHtml().regex("<a[^<>]*href=[\"']{1}(/yewu/.*?)[\"']{1}").all();
+        List<String> strings = page.getHtml().links().regex(".*/yewu/.*").all();
         page.addTargetRequests(strings);
         page.putField("title",page.getHtml().regex("<title>(.*)</title>"));
-        page.putField("body",page.getHtml().xpath("//dd[@class='w133']"));
+        page.putField("body",page.getHtml().xpath("//dd"));
     }
 
     @Override
     public Site getSite() {
         return Site.me().setDomain("sh.58.com").addStartUrl("http://sh.58.com/");  //To change body of implemented methods use File | Settings | File Templates.
     }
+
+    public static void main(String[] args) {
+        Spider.create(new F58PageProcesser()).run();
+    }
 }
diff --git a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
index 89b74d63..4ac93107 100644
--- a/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
+++ b/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java
@@ -2,6 +2,7 @@ package us.codecraft.webmagic.samples;
 
 import us.codecraft.webmagic.Site;
 import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Spider;
 import us.codecraft.webmagic.processor.PageProcessor;
 
 import java.util.List;
@@ -26,4 +27,8 @@ public class HuxiuProcessor implements PageProcessor {
         return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/").
                 setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
     }
+
+    public static void main(String[] args) {
+        Spider.create(new HuxiuProcessor()).run();
+    }
 }